code-execution

Sleeping

App Files Files Community

code-execution / helper_functions_api.py

pvanand

Update helper_functions_api.py

b1fa23d verified 6 months ago

raw

history blame

10 kB

	# !pip install mistune
	import mistune
	from mistune.plugins.table import table
	from jinja2 import Template
	import re
	import os

	def md_to_html(md_text):
	    """Render a Markdown string to single-line HTML.

	    A mistune HTML renderer with the ``table`` plugin is built on each call,
	    and every newline character is stripped from the rendered markup.
	    """
	    to_html = mistune.Markdown(mistune.HTMLRenderer(), plugins=[table])
	    rendered = to_html(md_text)
	    # Dropping the newlines keeps the markup on one line, as before.
	    return ''.join(rendered.split('\n'))

	####------------------------------ OPTIONAL --> User ID and persistent data storage -------------------------------------####
	from datetime import datetime
	import psycopg2

	from dotenv import load_dotenv, find_dotenv

	# Load environment variables from the "keys.env" file; this must run before
	# the lookups below so that values defined there are visible in os.environ.
	load_dotenv("keys.env")

	# Read with os.getenv(): these are None when the variable is not set.
	TOGETHER_API_KEY = os.getenv('TOGETHER_API_KEY')
	BRAVE_API_KEY = os.getenv('BRAVE_API_KEY')
	GROQ_API_KEY = os.getenv("GROQ_API_KEY")
	HELICON_API_KEY = os.getenv("HELICON_API_KEY")
	# Read with os.environ[...]: a missing variable raises KeyError at import time.
	SUPABASE_USER = os.environ['SUPABASE_USER']
	SUPABASE_PASSWORD = os.environ['SUPABASE_PASSWORD']

	def insert_data(user_id, user_query, subtopic_query, response, html_report):
	    """Insert one chat record into the ``research_pro_chat_v2`` table.

	    Opens a new PostgreSQL connection with the module-level Supabase
	    credentials, inserts a single row stamped with ``datetime.now()``,
	    commits, and closes the cursor and connection.

	    Args:
	        user_id: Value stored in the ``user_id`` column.
	        user_query: Value stored in the ``user_query`` column.
	        subtopic_query: Value stored in the ``subtopic_query`` column.
	        response: Value stored in the ``response`` column.
	        html_report: Value stored in the ``html_report`` column.

	    Raises:
	        psycopg2.Error: Propagated unchanged when connecting, executing or
	            committing fails; the cursor and connection are still closed.
	    """
	    insert_query = """
	INSERT INTO research_pro_chat_v2 (user_id, user_query, subtopic_query, response, html_report, created_at)
	VALUES (%s, %s, %s, %s, %s, %s);
	"""
	    # Connect to your database
	    conn = psycopg2.connect(
	        dbname="postgres",
	        user=SUPABASE_USER,
	        password=SUPABASE_PASSWORD,
	        host="aws-0-us-west-1.pooler.supabase.com",
	        port="5432"
	    )
	    # try/finally guarantees the connection (and cursor) are released even
	    # when execute() or commit() raises, instead of leaking them.
	    try:
	        cur = conn.cursor()
	        try:
	            cur.execute(
	                insert_query,
	                (user_id, user_query, subtopic_query, response, html_report, datetime.now()),
	            )
	            conn.commit()
	        finally:
	            cur.close()
	    finally:
	        conn.close()

	####-----------------------------------------------------END----------------------------------------------------------####


	import ast
	from fpdf import FPDF
	import re
	import pandas as pd
	import nltk
	nltk.download('stopwords')
	nltk.download('punkt')
	import requests
	import json
	from retry import retry
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from bs4 import BeautifulSoup
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize
	from brave import Brave
	from together import Together
	from langchain_core.output_parsers import JsonOutputParser
	from fuzzy_json import loads
	from half_json.core import JSONFixer
	from openai import OpenAI

	# Model identifiers; llm_default_small is the default model of together_response().
	llm_default_small = "llama3-8b-8192"
	llm_default_medium = "llama3-70b-8192"

	# System prompts: JSON-only output, Python-list-only output, and a generic
	# task prompt (SysPromptDefault is the default used by together_response()).
	SysPromptJson = "You are now in the role of an expert AI who can extract structured information from user request. Both key and value pairs must be in double quotes. You must respond ONLY with a valid JSON file. Do not add any additional comments."
	SysPromptList = "You are now in the role of an expert AI who can extract structured information from user request. All elements must be in double quotes. You must respond ONLY with a valid python List. Do not add any additional comments."
	SysPromptDefault = "You are an expert AI, complete the given task. Do not add any additional comments."

	import tiktoken # Used to limit tokens
	# The gpt-3.5-turbo encoding is used as the available stand-in for a Llama 3
	# tokenizer; replace it if a better match is found.
	encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

	def limit_tokens(input_string, token_limit=8000):
	    """Truncate text to at most ``token_limit`` tokens before it is sent to the model.

	    The string is tokenised with the module-level ``encoding``, the first
	    ``token_limit`` token ids are kept, and those are decoded back to text.
	    """
	    token_ids = encoding.encode(input_string)
	    kept_ids = token_ids[:token_limit]
	    return encoding.decode(kept_ids)

	def together_response(message, model=llm_default_small, SysPrompt=SysPromptDefault, temperature=0.2):
	    """Send a system + user message pair to the chat model and return its reply text.

	    Despite the name, the request goes through an OpenAI-compatible client
	    pointed at the Helicone gateway with ``https://api.groq.com`` as target.
	    A new client is created on every call.

	    Args:
	        message: Content of the user message.
	        model: Model identifier passed to the completion request.
	        SysPrompt: Content of the system message.
	        temperature: Sampling temperature passed to the completion request.

	    Returns:
	        The ``content`` of the first choice in the completion response.
	    """
	    gateway_headers = {
	        "Helicone-Auth": f"Bearer {HELICON_API_KEY}",
	        "Helicone-Target-Url": "https://api.groq.com"
	    }
	    client = OpenAI(
	        api_key=GROQ_API_KEY,
	        base_url="https://gateway.hconeai.com/openai/v1",
	        default_headers=gateway_headers,
	    )

	    system_turn = {"role": "system", "content": SysPrompt}
	    user_turn = {"role": "user", "content": message}

	    completion = client.chat.completions.create(
	        model=model,
	        messages=[system_turn, user_turn],
	        temperature=temperature,
	    )
	    first_choice = completion.choices[0]
	    return first_choice.message.content


	def json_from_text(text):
	    """Parse JSON out of free-form text.

	    The span from the first ``{`` to the last ``}`` is taken as the candidate
	    (the whole text when no such span exists). It is parsed with the fuzzy
	    JSON loader first; if that raises, the candidate is repaired with
	    ``JSONFixer`` and parsed again.
	    """
	    found = re.search(r'\{[\s\S]*\}', text)
	    candidate = found.group(0) if found else text
	    try:
	        # First attempt: fuzzy JSON loader on the raw candidate
	        return loads(candidate)
	    except Exception:
	        # Fallback: JSONFixer repairs even half-finished JSON / remove if you need an exception
	        repaired = JSONFixer().fix(candidate)
	        return loads(repaired.line)

	def remove_stopwords(text):
	    """Return ``text`` with English stopwords removed.

	    The text is split with ``word_tokenize``; tokens whose lowercase form is
	    in NLTK's English stopword list are dropped and the remaining tokens are
	    joined with single spaces.
	    """
	    english_stopwords = set(stopwords.words('english'))
	    kept_tokens = []
	    for token in word_tokenize(text):
	        if token.lower() in english_stopwords:
	            continue
	        kept_tokens.append(token)
	    return ' '.join(kept_tokens)

	def rephrase_content(content, query):
	    """Ask the model to condense scraped text into markdown tables for a query.

	    The scraped ``content`` is token-limited with ``limit_tokens`` and
	    embedded, together with ``query``, in a fixed instruction prompt that is
	    sent through ``together_response`` with its default settings.
	    """
	    prompt = f"You are an information retriever,ignore everything you know, return only the\
	numerical or quantitative data regarding the query: {{{query}}} structured into markdown tables only \
	, using the scraped context:{{{limit_tokens(content)}}}"
	    return together_response(prompt)

	class Scraper:
	    """Minimal page fetcher built on a ``requests.Session`` with a fixed User-Agent."""

	    def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", timeout=2):
	        """Create the session.

	        Args:
	            user_agent: Value sent in the ``User-Agent`` header of every request.
	            timeout: Per-request timeout in seconds passed to ``session.get``.
	        """
	        self.session = requests.Session()
	        self.session.headers.update({"User-Agent": user_agent})
	        self.timeout = timeout

	    @retry(exceptions=requests.exceptions.RequestException, tries=3, delay=1)
	    def _get(self, url):
	        """Issue one GET; request errors propagate so that ``retry`` can re-run it."""
	        return self.session.get(url, timeout=self.timeout)

	    def fetch_content(self, url):
	        """Return the body of ``url`` as text, or ``None`` when it cannot be fetched.

	        Previously ``@retry`` wrapped this method, but the method caught
	        ``RequestException`` itself, so no exception ever reached the
	        decorator and nothing was retried. The retried call now lives in
	        ``_get``; the error is caught (and printed) only after all attempts
	        have failed.

	        Returns:
	            ``response.text`` when the status code is exactly 200; ``None``
	            for any other status or when every attempt raised a
	            ``requests`` exception.
	        """
	        try:
	            response = self._get(url)
	        except requests.exceptions.RequestException as e:
	            print(f"Error fetching page content for {url}: {e}")
	            return None
	        if response.status_code == 200:
	            return response.text
	        return None

	def extract_main_content(html):
	    """Return the text of all ``<p>`` elements in ``html`` joined by single spaces.

	    Falsy input (``None`` or an empty string) yields an empty string without
	    invoking the parser. Otherwise the markup is parsed with the ``lxml``
	    backend of BeautifulSoup.
	    """
	    if not html:
	        return ""
	    soup = BeautifulSoup(html, 'lxml')
	    return ' '.join(tag.get_text() for tag in soup.find_all('p'))

	def process_content(url, query):
	    """Fetch ``url``, extract its paragraph text and have the model summarise it.

	    Returns:
	        A ``(rephrased_content, url)`` tuple. The first element is an empty
	        string when the page could not be fetched or contained no paragraph
	        text.
	    """
	    page = Scraper().fetch_content(url)
	    if not page:
	        return "", url

	    main_text = extract_main_content(page)
	    if not main_text:
	        return "", url

	    # Stopwords are removed first, then the text is cut to 4096*4 characters.
	    condensed = remove_stopwords(main_text)[:4096*4]
	    return rephrase_content(condensed, query), url

	def fetch_and_extract_content(urls, query):
	    """Run ``process_content`` for every URL concurrently, one worker thread per URL.

	    Args:
	        urls: Sized collection of URLs to process.
	        query: Query forwarded to ``process_content`` for each URL.

	    Returns:
	        A list of the ``process_content`` results in completion order (not
	        input order). An empty ``urls`` yields an empty list.

	    Raises:
	        Exception: Whatever a ``process_content`` call raised, re-raised by
	            ``future.result()``.
	    """
	    # ThreadPoolExecutor rejects max_workers=0 with ValueError, so an empty
	    # URL list is answered directly instead of crashing.
	    if not urls:
	        return []

	    with ThreadPoolExecutor(max_workers=len(urls)) as executor:
	        futures = [executor.submit(process_content, url, query) for url in urls]
	        all_text_with_urls = [future.result() for future in as_completed(futures)]

	    return all_text_with_urls

	def search_brave(query, num_results=5):
	    """Search Brave for ``query`` and return the result URLs as strings.

	    Args:
	        query: Search query passed as ``q``.
	        num_results: Passed as ``count`` to the search call.
	    """
	    client = Brave(BRAVE_API_KEY)
	    results = client.search(q=query, count=num_results)
	    return list(map(str, results.urls))

	def generate_report_with_reference(full_data):
	    """
	    Generate HTML report with references and saves pdf report to "generated_pdf_report.pdf"

	    Args:
	        full_data: Iterable of mappings. Each must provide ``'md_report'``
	            (markdown text) and ``'text_with_urls'`` (the string form of a
	            list of ``(reference_text, url)`` tuples).

	    Returns:
	        The concatenation of one rendered HTML page per entry, each produced
	        from "report_with_references_template.html".

	    Side effects:
	        Reads the template file from the working directory and writes
	        "generated_pdf_report.pdf" there, with one PDF page added per entry.
	    """
	    pdf = FPDF()
	    with open("report_with_references_template.html") as f: # src/research-pro/app_v1.5_online/
	        html_template = f.read()
	    # The template is the same for every entry, so compile it once instead of
	    # once per loop iteration.
	    template = Template(html_template)

	    html_pages = []
	    for idx, subtopic_data in enumerate(full_data, start=1):
	        md_report = md_to_html(subtopic_data['md_report'])
	        # Convert the string representation of a list of tuples back to a list of tuples
	        references = ast.literal_eval(subtopic_data['text_with_urls'])

	        collapsible_blocks = []
	        for ref_number, reference in enumerate(references, start=1):
	            ref_text = md_to_html(reference[0])
	            ref_url = reference[1]
	            urls_html = f'<a href="{ref_url}"> {ref_url}</a>'

	            collapsible_block = '''
	<details>
	<summary>Reference {}: {}</summary>
	<div>
	<p>{}</p>
	<ul>{}</ul>
	</div>
	</details>
	'''.format(ref_number, urls_html, ref_text, urls_html)

	            collapsible_blocks.append(collapsible_block)

	        references_html = '\n'.join(collapsible_blocks)
	        html_pages.append(template.render(md_report=md_report, references=references_html))

	        pdf.add_page()
	        pdf_report = f"<h1><strong>Report {idx}</strong></h1>"+md_report+f"<h1><strong>References for Report {idx}</strong></h1>"+references_html

	        # Non-ASCII characters are dropped before the markup is written to the PDF
	        pdf.write_html(pdf_report.encode('ascii', 'ignore').decode('ascii'))

	    pdf.output("generated_pdf_report.pdf")
	    return ''.join(html_pages)

	def write_dataframes_to_excel(dataframes_list, filename):
	    """
	    input: [df_list1, df_list2, ..]
	    saves filename.xlsx

	    Each inner list goes to its own sheet ("Sheet1", "Sheet2", ...), with its
	    dataframes stacked vertically and one blank row between them.

	    When there is no dataframe at all (no tables were found), nothing is
	    written and the function returns without touching ``filename``. Any other
	    failure while writing is reported on stdout rather than raised, keeping
	    the export best-effort.
	    """
	    # Explicit check for the "no tables found" case; this replaces the bare
	    # ``except: pass`` that also hid every other error.
	    if not any(len(dataframes) > 0 for dataframes in dataframes_list):
	        return

	    try:
	        with pd.ExcelWriter(filename, engine="openpyxl") as writer:
	            for sheet_number, dataframes in enumerate(dataframes_list, start=1):
	                startrow = 0
	                for df in dataframes:
	                    df.to_excel(writer, sheet_name=f"Sheet{sheet_number}", startrow=startrow, index=False)
	                    # header row + data rows + one blank separator row
	                    startrow += len(df) + 2
	    except Exception as exc:
	        print(f"Could not write dataframes to {filename}: {exc}")

	def extract_tables_from_html(html_file):
	    """
	    input: html_file
	    output: [df1,df2,df3,..]

	    ``html_file`` is passed straight to BeautifulSoup. One dataframe is built
	    per ``<table>``: the text of its ``<th>`` cells supplies the column names
	    and the ``<td>`` text of each ``<tr>`` supplies one row.

	    Rows without any ``<td>`` (such as the header row) are skipped, so they no
	    longer show up as an empty first row. When the number of ``<th>`` cells
	    does not match the widest data row, default integer column labels are
	    used instead of failing in the DataFrame constructor.
	    """
	    soup = BeautifulSoup(html_file, 'html.parser')

	    dataframes = []
	    for table in soup.find_all('table'):
	        headers = [th.text for th in table.find_all('th')]

	        data = []
	        for row in table.find_all('tr'):
	            row_data = [td.text for td in row.find_all('td')]
	            # A <tr> holding only <th> cells yields no data and is skipped.
	            if row_data:
	                data.append(row_data)

	        # Use the headers only when they fit the data; a table with no data
	        # rows keeps its headers and produces an empty dataframe.
	        width = max((len(row_data) for row_data in data), default=len(headers))
	        columns = headers if len(headers) == width else None

	        dataframes.append(pd.DataFrame(data, columns=columns))

	    return dataframes