Spaces:
Sleeping
shubhendu-ghosh-DS committed
Commit 193bf75
1 Parent(s): 74c6795
summarized the google searches
Browse files
- app.py +13 -31
- google_search_service/__pycache__/search_google.cpython-38.pyc +0 -0
- google_search_service/search_google.py +10 -0
- requirements.txt +35 -1
- summerizer_service/__pycache__/summarize_text.cpython-38.pyc +0 -0
- summerizer_service/summarize_text.py +13 -0
- web_scraping_service/__pycache__/beautiful_scrape.cpython-38.pyc +0 -0
- web_scraping_service/beautiful_scrape.py +24 -0
app.py
CHANGED
@@ -1,43 +1,25 @@
 import gradio as gr
-from googlesearch import search
-from bs4 import BeautifulSoup
-import requests
+from google_search_service.search_google import GoogleSearchService
+from summerizer_service.summarize_text import Summarizer
+from web_scraping_service.beautiful_scrape import WebScrapingService
 
 
-def google_search(query, num_results=5):
-    search_results = search(query, num_results=num_results)
-    return search_results
-
-def scrape_text_from_url(url):
-    try:
-        response = requests.get(url)
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        # Remove specific elements (customize as needed)
-        unwanted_elements = ['footer', 'script', 'style', 'noscript']
-        for tag in unwanted_elements:
-            for el in soup.find_all(tag):
-                el.extract()
-
-        # Extract text from remaining paragraphs
-        text = ' '.join([p.text for p in soup.find_all('p')])
-
-        return text.strip()  # Strip leading and trailing whitespaces
-    except Exception as e:
-        print(f"Error scraping {url}: {e}")
-        return None
 
 def get_google_data(search_term):
-
-    search_results = google_search(search_term)
+    summaries = ''
+    search_results = GoogleSearchService().google_search(search_term)
     for i, result in enumerate(search_results, start=1):
-        text = scrape_text_from_url(result)
+        text = WebScrapingService().scrape_text_from_url(result)
+
         if text:
-
+            summary = Summarizer().summarize_text(text)
+            print("summary is -----", summary)
+            summaries += summary
+    search_result = Summarizer().summarize_text(summaries)
 
-    return
+    return search_result
 
 
 iface = gr.Interface(fn=get_google_data, inputs="text", outputs="text")
 
-iface.launch(
+iface.launch()
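The refactor above replaces the inline helpers with three service classes and adds a second summarization pass: each scraped page is summarized, the per-page summaries are concatenated, and the concatenation is summarized once more into the final answer. A minimal local driver that restates this flow outside the Gradio UI could look like the sketch below (the summarize_query name and the sample query are illustrative, not part of the commit):

from google_search_service.search_google import GoogleSearchService
from summerizer_service.summarize_text import Summarizer
from web_scraping_service.beautiful_scrape import WebScrapingService

def summarize_query(query, num_results=5):
    # One summary per scraped result, then a summary of the summaries.
    summaries = ''
    for url in GoogleSearchService().google_search(query, num_results=num_results):
        text = WebScrapingService().scrape_text_from_url(url)
        if text:
            summaries += Summarizer().summarize_text(text)
    return Summarizer().summarize_text(summaries)

if __name__ == '__main__':
    print(summarize_query("gradio interface examples"))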
google_search_service/__pycache__/search_google.cpython-38.pyc
ADDED
Binary file (737 Bytes)
google_search_service/search_google.py
ADDED
@@ -0,0 +1,10 @@
+from googlesearch import search
+
+class GoogleSearchService:
+
+    def __init__(self):
+        pass
+
+    def google_search(self, query, num_results=5):
+        search_results = search(query, num_results=num_results)
+        return search_results
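GoogleSearchService is a thin wrapper around googlesearch-python's search(), which yields plain result URLs, so the return value can be iterated directly, as app.py does. A hypothetical standalone use, assuming the googlesearch-python pin from requirements.txt and outbound network access:

from google_search_service.search_google import GoogleSearchService

service = GoogleSearchService()
for url in service.google_search("hugging face spaces", num_results=3):
    print(url)  # each item is a result URL string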
requirements.txt
CHANGED
@@ -1,9 +1,12 @@
+absl-py==2.0.0
 aiofiles==23.2.1
 altair==5.2.0
 annotated-types==0.6.0
 anyio==4.2.0
+astunparse==1.6.3
 attrs==23.1.0
 beautifulsoup4==4.12.2
+cachetools==5.3.2
 certifi==2023.11.17
 charset-normalizer==3.3.2
 click==8.1.7
@@ -14,31 +17,47 @@ exceptiongroup==1.2.0
 fastapi==0.108.0
 ffmpy==0.3.1
 filelock==3.13.1
+flatbuffers==23.5.26
 fonttools==4.47.0
 fsspec==2023.12.2
+gast==0.4.0
+google-auth==2.26.1
+google-auth-oauthlib==1.0.0
+google-pasta==0.2.0
 googlesearch-python==1.2.3
 gradio==4.12.0
 gradio-client==0.8.0
+grpcio==1.60.0
 h11==0.14.0
+h5py==3.10.0
 httpcore==1.0.2
 httpx==0.26.0
 huggingface-hub==0.20.1
 idna==3.6
+importlib-metadata==7.0.1
 importlib-resources==6.1.1
 Jinja2==3.1.2
 jsonschema==4.20.0
 jsonschema-specifications==2023.12.1
+keras==2.13.1
 kiwisolver==1.4.5
+libclang==16.0.6
+Markdown==3.5.1
 markdown-it-py==3.0.0
 MarkupSafe==2.1.3
 matplotlib==3.7.4
 mdurl==0.1.2
-numpy==1.24.
+numpy==1.24.3
+oauthlib==3.2.2
+opt-einsum==3.3.0
 orjson==3.9.10
 packaging==23.2
 pandas==2.0.3
 Pillow==10.1.0
 pkgutil-resolve-name==1.3.10
+protobuf==4.25.1
+pyasn1==0.5.1
+pyasn1-modules==0.3.0
 pydantic==2.5.3
 pydantic-core==2.14.6
 pydub==0.25.1
@@ -49,22 +68,37 @@ python-multipart==0.0.6
 pytz==2023.3.post1
 PyYAML==6.0.1
 referencing==0.32.0
+regex==2023.12.25
 requests==2.31.0
+requests-oauthlib==1.3.1
 rich==13.7.0
 rpds-py==0.16.2
+rsa==4.9
+safetensors==0.4.1
 semantic-version==2.10.0
 shellingham==1.5.4
 six==1.16.0
 sniffio==1.3.0
 soupsieve==2.5
 starlette==0.32.0.post1
+tensorboard==2.13.0
+tensorboard-data-server==0.7.2
+tensorflow==2.13.0
+tensorflow-estimator==2.13.0
+tensorflow-intel==2.13.0
+tensorflow-io-gcs-filesystem==0.31.0
+termcolor==2.4.0
+tokenizers==0.15.0
 tomlkit==0.12.0
 toolz==0.12.0
 tqdm==4.66.1
+transformers==4.36.2
 typer==0.9.0
 typing-extensions==4.9.0
 tzdata==2023.4
 urllib3==2.1.0
 uvicorn==0.25.0
 websockets==11.0.3
+werkzeug==3.0.1
+wrapt==1.16.0
 zipp==3.17.0
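Most of the new pins (tensorflow, keras, h5py, tensorboard, transformers, tokenizers, safetensors) appear to exist to back the facebook/bart-large-cnn summarization pipeline added in summerizer_service; no torch pin is added, so transformers presumably runs the TensorFlow weights. Note that tensorflow-intel typically ships Windows-only wheels, so that pin may not resolve on a Linux Space. A quick post-install sanity check, shown only as a sketch:

# Sketch: verify the summarization stack resolves after
# `pip install -r requirements.txt` (versions per the pins above).
import tensorflow as tf
import transformers

print(tf.__version__)            # expected: 2.13.0
print(transformers.__version__)  # expected: 4.36.2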
summerizer_service/__pycache__/summarize_text.cpython-38.pyc
ADDED
Binary file (824 Bytes)
summerizer_service/summarize_text.py
ADDED
@@ -0,0 +1,13 @@
+from transformers import pipeline
+
+
+
+class Summarizer:
+
+    def __init__(self):
+        pass
+
+    def summarize_text(self, text):
+        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+        summary = summarizer(text, max_length=500, min_length=100, do_sample=False)
+        return summary[0]['summary_text']
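summarize_text constructs the pipeline on every call, so the BART model is re-loaded for each scraped page and again for the final pass. One possible variant, shown only as a sketch, caches the pipeline on the instance and passes truncation=True to guard against inputs longer than the model's input limit (CachedSummarizer is a hypothetical name, not part of this commit):

from transformers import pipeline

class CachedSummarizer:
    def __init__(self, model_name="facebook/bart-large-cnn"):
        # Model loading is the expensive step; do it once per instance.
        self._summarizer = pipeline("summarization", model=model_name)

    def summarize_text(self, text):
        summary = self._summarizer(text, max_length=500, min_length=100,
                                   do_sample=False, truncation=True)
        return summary[0]['summary_text']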
web_scraping_service/__pycache__/beautiful_scrape.cpython-38.pyc
ADDED
Binary file (1.27 kB)
web_scraping_service/beautiful_scrape.py
ADDED
@@ -0,0 +1,24 @@
+from bs4 import BeautifulSoup
+import requests
+from requests.exceptions import HTTPError
+
+class WebScrapingService:
+
+    def __init__(self):
+        pass
+
+    def scrape_text_from_url(self, url):
+        try:
+            response = requests.get(url)
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            unwanted_elements = ['footer', 'script', 'style', 'noscript']
+            for tag in unwanted_elements:
+                for el in soup.find_all(tag):
+                    el.extract()
+
+            text = ' '.join([p.text for p in soup.find_all('p')])
+
+            return text.strip()  # Strip leading and trailing whitespaces
+        except Exception as e:
+            raise HTTPError(e)
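Unlike the inline helper removed from app.py, which returned None on failure, this service wraps any exception in requests.exceptions.HTTPError and re-raises it, so a single failing URL will now propagate out of get_google_data. A hypothetical call site that keeps the old fall-through behaviour (the example URL is illustrative):

from requests.exceptions import HTTPError
from web_scraping_service.beautiful_scrape import WebScrapingService

scraper = WebScrapingService()
try:
    text = scraper.scrape_text_from_url("https://en.wikipedia.org/wiki/Web_scraping")
except HTTPError as exc:
    text = None  # mirror the old behaviour: skip pages that fail to scrape
if text:
    print(text[:200])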