Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
import spacy
|
5 |
+
from urllib.request import urlopen, Request
|
6 |
+
from bs4 import BeautifulSoup
|
7 |
+
import gradio as gr
|
8 |
+
|
9 |
+
# Load the small English spaCy pipeline once at import time. The except
# branch treats OSError from spacy.load as "model package not installed",
# downloads the package, and then retries the load.
_SPACY_MODEL = "en_core_web_sm"

try:
    nlp = spacy.load(_SPACY_MODEL)
except OSError:
    from spacy.cli import download

    download(_SPACY_MODEL)
    nlp = spacy.load(_SPACY_MODEL)
|
16 |
+
|
17 |
+
def extract_text(url, timeout=10):
    """Fetch *url* and return the page's text content as a single string.

    Args:
        url: Absolute ``http://`` or ``https://`` address of the page.
            Surrounding whitespace is ignored.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        Every non-empty text fragment of the parsed HTML, stripped of
        surrounding whitespace and joined with single spaces.

    Raises:
        ValueError: If *url* does not use the http or https scheme.
        urllib.error.URLError: If the request fails.
        TimeoutError: If the server does not answer within *timeout*.
    """
    url = url.strip()
    # urlopen also honours file:// and ftp:// URLs. The URL is supplied by
    # the end user, so anything but plain web addresses is refused to avoid
    # exposing local files.
    if not url.lower().startswith(("http://", "https://")):
        raise ValueError(f"Only http(s) URLs are supported, got: {url!r}")

    # Some sites reject the default urllib User-Agent, so send a browser-like one.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the connection even if read() fails.
    with urlopen(req, timeout=timeout) as response:
        html = response.read()

    return ' '.join(BeautifulSoup(html, "html.parser").stripped_strings)
|
22 |
+
|
23 |
+
def _sentence_context(ent, fallback):
    """Return the lower-cased sentence containing *ent*, or *fallback*.

    ``Span.sent`` raises ValueError when the pipeline has not set sentence
    boundaries; in that case the caller-supplied whole-document text is used.
    """
    try:
        return ent.sent.text.lower()
    except ValueError:
        return fallback


def _mentions(context, keyword, longer_terms=()):
    """Return True if *keyword* occurs in *context* outside *longer_terms*.

    Masking the longer terms first keeps "child" from matching inside
    "grandchild", and "grandchild" from matching inside "great-grandchild".
    """
    for term in longer_terms:
        context = context.replace(term, " ")
    return keyword in context


def _unique(items):
    """Return *items* as a list without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(items))


def extract_details(text):
    """Build a one-row Markdown table of family details found in *text*.

    PERSON and DATE entities are taken from the spaCy pipeline ``nlp``. An
    entity is assigned to a relationship column only when the sentence it
    appears in mentions the matching keyword ("husband", "child",
    "grandchild", "great-grandchild", "marriage"), instead of whenever the
    keyword appears anywhere in the document.

    Args:
        text: Plain text to analyse.

    Returns:
        A string holding a header line, a separator line and one data row.
        The Name and Birthday cells hold the first PERSON and first DATE
        entity (empty if none); Husband Name and Marriage Date hold
        "Not Found" when no matching entity exists.
    """
    doc = nlp(text)
    # Lower-case the document once; only used if sentences are unavailable.
    whole_text = text.lower()

    # Pair every entity with the lower-cased sentence it occurs in.
    people = [
        (ent.text, _sentence_context(ent, whole_text))
        for ent in doc.ents
        if ent.label_ == "PERSON"
    ]
    dated = [
        (ent.text, _sentence_context(ent, whole_text))
        for ent in doc.ents
        if ent.label_ == "DATE"
    ]

    names = [name for name, _ in people]
    dates = [date for date, _ in dated]

    husband_name = next(
        (name for name, context in people if "husband" in context),
        "Not Found",
    )
    marriage_date = next(
        (date for date, context in dated if "marriage" in context),
        "Not Found",
    )
    children = _unique(
        name
        for name, context in people
        if _mentions(context, "child", ("great-grandchild", "grandchild"))
    )
    grandchildren = _unique(
        name
        for name, context in people
        if _mentions(context, "grandchild", ("great-grandchild",))
    )
    greatgrandchildren = _unique(
        name for name, context in people if "great-grandchild" in context
    )

    # Construct the table
    table = f"""
| Name | Birthday | Husband Name | Children | Marriage Date | Grandchildren | Great-grandchildren |
|-----------------|---------------|----------------|------------------|-----------------|-----------------------|-----------------------|
| {', '.join(names[:1])} | {', '.join(dates[:1])} | {husband_name} | {', '.join(children)} | {marriage_date} | {', '.join(grandchildren)} | {', '.join(greatgrandchildren)} |
"""
    return table
|
45 |
+
|
46 |
+
def create_table(url):
    """Download the page at *url* and return its family-details table.

    The page text comes from ``extract_text`` and is handed unchanged to
    ``extract_details``, whose return value is passed straight through.
    """
    return extract_details(extract_text(url))
|
49 |
+
|
50 |
+
# Gradio front end: one text input (the URL) and one text output (the table).
demo = gr.Interface(
    fn=create_table,
    inputs="text",
    outputs="text",
)

if __name__ == "__main__":
    # Launch only when executed as a script, not when imported.
    demo.launch(show_api=False)
|