pvanand committed on
Commit
47473dd
·
verified ·
1 Parent(s): 00eb6f3

Update helper_functions_api.py

Browse files
Files changed (1) hide show
  1. helper_functions_api.py +49 -37
helper_functions_api.py CHANGED
@@ -4,6 +4,7 @@ from mistune.plugins.table import table
4
  from jinja2 import Template
5
  import re
6
  import os
 
7
 
8
  def md_to_html(md_text):
9
  renderer = mistune.HTMLRenderer()
@@ -70,7 +71,16 @@ from together import Together
70
  llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
71
  llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
72
 
73
- SysPromptData = "You are an information retriever and summarizer, return only the factual information regarding the user query"
 
 
 
 
 
 
 
 
 
74
  SysPromptDefault = "You are an expert AI, complete the given task. Do not add any additional comments."
75
  SysPromptSearch = """You are a search query generator, create a concise Google search query, focusing only on the main topic and omitting additional redundant details, include year if necessory, 2024, Do not add any additional comments. OUTPUT ONLY THE SEARCH QUERY
76
  #Additional instructions:
@@ -152,11 +162,11 @@ def remove_stopwords(text):
152
  def rephrase_content(data_format, content, query):
153
 
154
  if data_format == "Structured data":
155
- return together_response(
156
- f"return only the factual information regarding the query: {{{query}}}. Output should be concise chunks of \
157
- paragraphs or tables or both, using the scraped context:{{{limit_tokens(content)}}}",
158
  SysPrompt=SysPromptData,
159
- max_tokens=500,
160
  )
161
  elif data_format == "Quantitative data":
162
  return together_response(
@@ -171,42 +181,44 @@ def rephrase_content(data_format, content, query):
171
  max_tokens=500,
172
  )
173
 
174
- class Scraper:
175
- def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"):
176
- self.session = requests.Session()
177
- self.session.headers.update({"User-Agent": user_agent})
178
-
179
- @retry(tries=3, delay=1)
180
- def fetch_content(self, url):
181
  try:
182
- response = self.session.get(url, timeout=2)
183
- if response.status_code == 200:
184
- return response.text
185
- except requests.exceptions.RequestException as e:
186
- print(f"Error fetching page content for {url}: {e}")
187
- return None
188
-
189
- def extract_main_content(html):
190
- if html:
191
- plain_text = ""
192
- soup = BeautifulSoup(html, 'lxml')
193
- for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']):
194
- plain_text += element.get_text(separator=" ", strip=True) + "\n"
195
- return plain_text
 
 
 
 
 
 
 
 
 
 
196
  return ""
197
 
198
  def process_content(data_format, url, query):
199
- scraper = Scraper()
200
- html_content = scraper.fetch_content(url)
201
- if html_content:
202
- content = extract_main_content(html_content)
203
- if content:
204
- rephrased_content = rephrase_content(
205
- data_format=data_format,
206
- content=limit_tokens(remove_stopwords(content), token_limit=1000),
207
- query=query,
208
- )
209
- return rephrased_content, url
210
  return "", url
211
 
212
  def fetch_and_extract_content(data_format, urls, query):
 
4
  from jinja2 import Template
5
  import re
6
  import os
7
+ from urllib.parse import urlparse
8
 
9
  def md_to_html(md_text):
10
  renderer = mistune.HTMLRenderer()
 
71
  llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
72
  llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
73
 
74
+ SysPromptData = """You are expert in information extraction from the given context.
75
+ Steps to follow:
76
+ 1. Check if relevant factual data regarding <USER QUERY> is present in the <SCRAPED DATA>.
77
+ - IF YES, extract the maximum relevant factual information related to <USER QUERY> from the <SCRAPED DATA>.
78
+ - IF NO, then return "N/A"
79
+
80
+ Rules to follow:
81
+ - Return N/A if information is not present in the scraped data.
82
+ - FORGET EVERYTHING YOU KNOW, Only output information that is present in the scraped data, DO NOT MAKE UP INFORMATION
83
+ """
84
  SysPromptDefault = "You are an expert AI, complete the given task. Do not add any additional comments."
85
  SysPromptSearch = """You are a search query generator, create a concise Google search query, focusing only on the main topic and omitting additional redundant details, include year if necessory, 2024, Do not add any additional comments. OUTPUT ONLY THE SEARCH QUERY
86
  #Additional instructions:
 
162
  def rephrase_content(data_format, content, query):
163
 
164
  if data_format == "Structured data":
165
+ return together_response(f"""
166
+ <SCRAPED DATA>{content}</SCRAPED DATA>
167
+ extract the maximum relevant factual information covering all aspects of <USER QUERY>{query}</USER QUERY> ONLY IF AVAILABLE in the scraped data.""",
168
  SysPrompt=SysPromptData,
169
+ max_tokens=900,
170
  )
171
  elif data_format == "Quantitative data":
172
  return together_response(
 
181
  max_tokens=500,
182
  )
183
 
def extract_main_content(url, timeout=30):
    """Fetch the main article text of *url* through the remote extraction service.

    The URL is forwarded to the ``extract-article`` endpoint hosted at
    ``pvanand-web-scraping.hf.space``, which is asked for English markdown
    output with tables included and images/links excluded.

    Args:
        url: Absolute URL of the page to extract. It must carry both a scheme
            and a network location, otherwise no request is made.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled service cannot block the caller forever.

    Returns:
        The value of the ``"article"`` field of the service's JSON response,
        or ``""`` when the URL is empty/invalid, the service does not answer
        with HTTP 200, the article is missing, or any error occurs.
    """
    if not url:
        return ""

    try:
        parsed = urlparse(url)
        # Relative or scheme-less URLs cannot be fetched by the service.
        if not (parsed.scheme and parsed.netloc):
            return ""

        # Query parameters understood by the extraction endpoint.
        params = {
            "url": url,
            "favor_precision": False,
            "favor_recall": False,
            "output_format": "markdown",
            "target_language": "en",
            "include_tables": True,
            "include_images": False,
            "include_links": False,
            "deduplicate": True,
        }

        response = requests.get(
            "https://pvanand-web-scraping.hf.space/extract-article",
            params=params,
            timeout=timeout,
        )
        if response.status_code != 200:
            return ""

        # A missing or null article is reported as "no content" rather than
        # leaking None (or a KeyError) to the caller.
        return response.json().get("article") or ""
    except Exception as exc:
        # Extraction is best-effort: report the failure and fall back to an
        # empty result. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        print(f"Error extracting content for {url}: {exc}")
        return ""
212
 
def process_content(data_format, url, query):
    """Extract the page at *url* and rephrase it with respect to *query*.

    Args:
        data_format: Forwarded unchanged to ``rephrase_content``.
        url: Page to extract via ``extract_main_content``.
        query: User query forwarded to ``rephrase_content``.

    Returns:
        A ``(text, url)`` tuple. ``text`` is the result of
        ``rephrase_content``, or ``""`` when nothing could be extracted.
    """
    extracted = extract_main_content(url)
    if not extracted:
        # Nothing usable came back for this URL; skip the LLM call.
        return "", url

    # Bound the context handed to the model (limit_tokens presumably
    # truncates to the given token budget; confirm against its definition).
    trimmed = limit_tokens(extracted, token_limit=4000)
    summary = rephrase_content(
        data_format=data_format,
        content=trimmed,
        query=query,
    )
    return summary, url
223
 
224
  def fetch_and_extract_content(data_format, urls, query):