| import re |
|
|
| import gradio as gr |
| import requests |
| from inscriptis import get_text |
| from inscriptis.css_profiles import CSS_PROFILES |
| from inscriptis.model.config import ParserConfig |
| from readability import Document |
|
|
# Shared inscriptis parser configuration, built once at import time from the
# "strict" entry of the CSS profiles that inscriptis ships.
INSCRIPTIS_CONFIG = ParserConfig(
    css=CSS_PROFILES["strict"],
)
|
|
|
|
def extract_text(url: str):
    """Fetch *url* and return its title, plain text, cleaned HTML and raw HTML.

    The page is downloaded with ``requests``, reduced to its main content by
    ``readability`` (``Document.summary``), and that content is rendered to
    plain text by ``inscriptis`` using ``INSCRIPTIS_CONFIG``.

    Args:
        url: Address of the page to download.

    Returns:
        A 4-tuple ``(title, text, clean_html, html)`` of strings. All four
        are empty when *url* is blank or the response body is blank; only
        ``text`` is empty when the extracted content has no word characters.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # A blank URL can never be fetched; answer like an empty page instead of
    # letting the HTTP client raise on it.
    if not url or not url.strip():
        return "", "", "", ""

    # Bound the request so an unresponsive host cannot block the caller
    # indefinitely.
    response = requests.get(url, timeout=30)

    # Still decode as UTF-8, but substitute U+FFFD for invalid byte sequences
    # so pages in other encodings do not abort with UnicodeDecodeError.
    html = response.content.decode("utf-8", errors="replace")

    if not html.strip():
        return "", "", "", ""

    parsed_doc = Document(html)

    title = parsed_doc.short_title()
    clean_html = parsed_doc.summary(html_partial=True)
    # The parsed document is no longer needed once title and summary are out.
    del parsed_doc

    text = get_text(clean_html, INSCRIPTIS_CONFIG).strip()

    if not re.search(r"\w+", text):
        # Nothing word-like survived extraction: report no text, but still
        # hand back the title and both HTML variants for inspection.
        return title, "", clean_html, html

    # Collapse any run of blank / whitespace-only lines into one blank line.
    text = re.sub(r"\n\s*\n", "\n\n", text)

    return title, text, clean_html, html
|
|
|
|
# Output components, one per element of the tuple returned by extract_text.
title = gr.Textbox(label="Title")
text = gr.Textbox(
    label="Text (`inscriptis` output)",
    lines=10,
)
clean_html = gr.Textbox(
    label="Clean HTML (`readability-lxml` output)",
    lines=10,
)
html = gr.Textbox(
    label="Raw HTML response",
    lines=10,
)

# Single input component: the URL of the page to process.
url_box = gr.Textbox(placeholder="https://hf.co/", label="URL")

# Wire extract_text to the components above and offer two example URLs.
demo = gr.Interface(
    fn=extract_text,
    inputs=url_box,
    outputs=[title, text, clean_html, html],
    examples=[
        ["https://huggingface.co/blog/peft"],
        [
            "https://www.nytimes.com/2023/03/08/technology/chatbots-disrupt-internet-industry.html"
        ],
    ],
)

# Start the Gradio app as soon as this module is executed.
demo.launch()
|
|