shubhendu-ghosh-DS committed
Commit 193bf75
Parent: 74c6795

summarized the google searches

app.py CHANGED
@@ -1,43 +1,25 @@
 import gradio as gr
-from googlesearch import search
-from bs4 import BeautifulSoup
-import requests
+from google_search_service.search_google import GoogleSearchService
+from summerizer_service.summarize_text import Summarizer
+from web_scraping_service.beautiful_scrape import WebScrapingService
 
 
-def google_search(query, num_results=5):
-    search_results = search(query, num_results=num_results)
-    return search_results
-
-def scrape_text_from_url(url):
-    try:
-        response = requests.get(url)
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        # Remove specific elements (customize as needed)
-        unwanted_elements = ['footer', 'script', 'style', 'noscript']
-        for tag in unwanted_elements:
-            for el in soup.find_all(tag):
-                el.extract()
-
-        # Extract text from remaining paragraphs
-        text = ' '.join([p.text for p in soup.find_all('p')])
-
-        return text.strip()  # Strip leading and trailing whitespaces
-    except Exception as e:
-        print(f"Error scraping {url}: {e}")
-        return None
 
 def get_google_data(search_term):
-    whole_result = ''
-    search_results = google_search(search_term)
+    summaries = ''
+    search_results = GoogleSearchService().google_search(search_term)
     for i, result in enumerate(search_results, start=1):
-        text = scrape_text_from_url(result)
+        text = WebScrapingService().scrape_text_from_url(result)
+
         if text:
-            whole_result += text
+            summary = Summarizer().summarize_text(text)
+            print("summary is -----", summary)
+            summaries += summary
+    search_result = Summarizer().summarize_text(summaries)
 
-    return whole_result
+    return search_result
 
 
 iface = gr.Interface(fn=get_google_data, inputs="text", outputs="text")
 
-iface.launch(share=True)
+iface.launch()
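
The refactor keeps the search → scrape → summarize flow in get_google_data but delegates each step to a service class, then runs one more summarization pass over the concatenated per-page summaries. A minimal sketch of exercising the pipeline without the Gradio UI (the query string is illustrative; assumes the service packages from this commit are on the import path):

    from google_search_service.search_google import GoogleSearchService
    from summerizer_service.summarize_text import Summarizer
    from web_scraping_service.beautiful_scrape import WebScrapingService

    # Same flow as get_google_data, minus the Gradio wrapper.
    urls = GoogleSearchService().google_search("what is a transformer model")
    summaries = ''
    for url in urls:
        text = WebScrapingService().scrape_text_from_url(url)
        if text:
            summaries += Summarizer().summarize_text(text)
    # Second-stage summary over the concatenated page summaries.
    print(Summarizer().summarize_text(summaries))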
google_search_service/__pycache__/search_google.cpython-38.pyc ADDED
Binary file (737 Bytes).
google_search_service/search_google.py ADDED
@@ -0,0 +1,10 @@
+from googlesearch import search
+
+class GoogleSearchService:
+
+    def __init__(self):
+        pass
+
+    def google_search(self, query, num_results=5):
+        search_results = search(query, num_results=num_results)
+        return search_results
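
With googlesearch-python 1.2.3, search() yields plain result URLs, so the service can be used on its own. A small usage sketch (query and result count are illustrative):

    from google_search_service.search_google import GoogleSearchService

    service = GoogleSearchService()
    for url in service.google_search("gradio interface tutorial", num_results=3):
        print(url)  # each result is a URL string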
requirements.txt CHANGED
@@ -1,9 +1,12 @@
+absl-py==2.0.0
 aiofiles==23.2.1
 altair==5.2.0
 annotated-types==0.6.0
 anyio==4.2.0
+astunparse==1.6.3
 attrs==23.1.0
 beautifulsoup4==4.12.2
+cachetools==5.3.2
 certifi==2023.11.17
 charset-normalizer==3.3.2
 click==8.1.7
@@ -14,31 +17,47 @@ exceptiongroup==1.2.0
 fastapi==0.108.0
 ffmpy==0.3.1
 filelock==3.13.1
+flatbuffers==23.5.26
 fonttools==4.47.0
 fsspec==2023.12.2
+gast==0.4.0
+google-auth==2.26.1
+google-auth-oauthlib==1.0.0
+google-pasta==0.2.0
 googlesearch-python==1.2.3
 gradio==4.12.0
 gradio-client==0.8.0
+grpcio==1.60.0
 h11==0.14.0
+h5py==3.10.0
 httpcore==1.0.2
 httpx==0.26.0
 huggingface-hub==0.20.1
 idna==3.6
+importlib-metadata==7.0.1
 importlib-resources==6.1.1
 Jinja2==3.1.2
 jsonschema==4.20.0
 jsonschema-specifications==2023.12.1
+keras==2.13.1
 kiwisolver==1.4.5
+libclang==16.0.6
+Markdown==3.5.1
 markdown-it-py==3.0.0
 MarkupSafe==2.1.3
 matplotlib==3.7.4
 mdurl==0.1.2
-numpy==1.24.4
+numpy==1.24.3
+oauthlib==3.2.2
+opt-einsum==3.3.0
 orjson==3.9.10
 packaging==23.2
 pandas==2.0.3
 Pillow==10.1.0
 pkgutil-resolve-name==1.3.10
+protobuf==4.25.1
+pyasn1==0.5.1
+pyasn1-modules==0.3.0
 pydantic==2.5.3
 pydantic-core==2.14.6
 pydub==0.25.1
@@ -49,22 +68,37 @@ python-multipart==0.0.6
 pytz==2023.3.post1
 PyYAML==6.0.1
 referencing==0.32.0
+regex==2023.12.25
 requests==2.31.0
+requests-oauthlib==1.3.1
 rich==13.7.0
 rpds-py==0.16.2
+rsa==4.9
+safetensors==0.4.1
 semantic-version==2.10.0
 shellingham==1.5.4
 six==1.16.0
 sniffio==1.3.0
 soupsieve==2.5
 starlette==0.32.0.post1
+tensorboard==2.13.0
+tensorboard-data-server==0.7.2
+tensorflow==2.13.0
+tensorflow-estimator==2.13.0
+tensorflow-intel==2.13.0
+tensorflow-io-gcs-filesystem==0.31.0
+termcolor==2.4.0
+tokenizers==0.15.0
 tomlkit==0.12.0
 toolz==0.12.0
 tqdm==4.66.1
+transformers==4.36.2
 typer==0.9.0
 typing-extensions==4.9.0
 tzdata==2023.4
 urllib3==2.1.0
 uvicorn==0.25.0
 websockets==11.0.3
+werkzeug==3.0.1
+wrapt==1.16.0
 zipp==3.17.0
summerizer_service/__pycache__/summarize_text.cpython-38.pyc ADDED
Binary file (824 Bytes).
summerizer_service/summarize_text.py ADDED
@@ -0,0 +1,13 @@
+from transformers import pipeline
+
+
+
+class Summarizer:
+
+    def __init__(self):
+        pass
+
+    def summarize_text(self, text):
+        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+        summary = summarizer(text, max_length=500, min_length=100, do_sample=False)
+        return summary[0]['summary_text']
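
Two caveats here: pipeline(...) is rebuilt on every summarize_text call, so the BART weights are reloaded for each scraped page, and bart-large-cnn accepts only about 1024 tokens of input, so long pages can overflow it. A hedged sketch of a variant (not part of this commit) that loads the model once and truncates long input:

    from transformers import pipeline

    class Summarizer:

        # Assumption: load the model once per process, not once per call.
        _summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

        def summarize_text(self, text):
            # truncation=True keeps input within the model's max length.
            summary = self._summarizer(text, max_length=500, min_length=100,
                                       do_sample=False, truncation=True)
            return summary[0]['summary_text']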
web_scraping_service/__pycache__/beautiful_scrape.cpython-38.pyc ADDED
Binary file (1.27 kB).
web_scraping_service/beautiful_scrape.py ADDED
@@ -0,0 +1,24 @@
+from bs4 import BeautifulSoup
+import requests
+from requests.exceptions import HTTPError
+
+class WebScrapingService:
+
+    def __init__(self):
+        pass
+
+    def scrape_text_from_url(self, url):
+        try:
+            response = requests.get(url)
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            unwanted_elements = ['footer', 'script', 'style', 'noscript']
+            for tag in unwanted_elements:
+                for el in soup.find_all(tag):
+                    el.extract()
+
+            text = ' '.join([p.text for p in soup.find_all('p')])
+
+            return text.strip()  # Strip leading and trailing whitespaces
+        except Exception as e:
+            raise HTTPError(e)
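
Unlike the old inline scraper, which printed the error and returned None, this version re-raises every failure as requests.exceptions.HTTPError, and get_google_data does not catch it, so a single unreachable URL fails the whole search. A sketch of a more forgiving scrape_text_from_url (the timeout value is an assumption):

    def scrape_text_from_url(self, url):
        try:
            # A timeout keeps one slow site from stalling the whole request.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error scraping {url}: {e}")
            return None  # the caller's `if text:` check then skips this page
        soup = BeautifulSoup(response.text, 'html.parser')
        for tag in ['footer', 'script', 'style', 'noscript']:
            for el in soup.find_all(tag):
                el.extract()
        return ' '.join(p.text for p in soup.find_all('p')).strip()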