import html
import os
import subprocess

import gradio as gr
from dotenv import load_dotenv
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

# Ensure Playwright installs the required browsers (and, if needed, system dependencies)
subprocess.run(["playwright", "install"])
# subprocess.run(["playwright", "install-deps"])
# Load environment variables
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
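# Optional guard (a sketch, not part of the original app): both endpoints below need this
# token, so failing fast with a clear message can save a confusing stack trace later.
# if not HUGGINGFACEHUB_API_TOKEN:
#     raise RuntimeError("HUGGINGFACEHUB_API_TOKEN is not set; add it as a Space secret or to .env")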
# Initialize the model instances
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

llm_model_instance = HuggingFaceEndpoint(
    repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
)

embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
)
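# Optional smoke test (commented out; an illustrative sketch, not from the original app):
# both objects follow the standard LangChain interface, so a quick call verifies that the
# Hub endpoints are reachable before they are wired into the graph.
# print(llm_model_instance.invoke("Reply with the single word: ok"))
# print(len(embedder_model_instance.embed_query("hello world")))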
graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance},
    "headless": False,
}
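# Note (an assumption about the deployment environment, not stated in the original):
# "headless": False opens a visible browser window, which is mainly useful for local
# debugging; on a headless server such as a Space, True is usually the safer value.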
def scrape_and_summarize(prompt, source):
    # The "Source" textbox content is written to a local file, and the graph scrapes that file.
    with open("file.html", "w") as file:
        file.write(html.unescape(source))
    # with open("file.html", "r") as file:
    #     text = file.read()
    # return {"prompt": prompt}, {"source": text}
    smart_scraper_graph = SmartScraperGraph(
        prompt=prompt,
        source="file.html",
        # source=source,
        config=graph_config,
    )
    result = smart_scraper_graph.run()
    exec_info = smart_scraper_graph.get_execution_info()
    return result, prettify_exec_info(exec_info)
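# Example of exercising the pipeline without the UI (commented out; the HTML snippet and
# prompt are illustrative assumptions, not from the original app):
# sample_html = "<html><body><a href='https://example.com/pr-1'>Press release 1</a></body></html>"
# result, info = scrape_and_summarize("List all links with their text and urls.", sample_html)
# print(result)
# print(info)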
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Scrape websites, no-code version")
    gr.Markdown(
        """Easily scrape and summarize web content using AI models from the Hugging Face Hub, without writing any code. Enter your prompt and a source URL to get started.
This is a no-code version of the excellent library [ScrapeGraphAI](https://github.com/VinciGit00/Scrapegraph-ai).
It's a basic demo and a work in progress. Please contribute to make it more useful!"""
    )

    with gr.Row():
        with gr.Column():
            model_dropdown = gr.Textbox(label="Model", value="Mistral-7B-Instruct-v0.2")
            prompt_input = gr.Textbox(label="Prompt", value="List me all the press releases with their headlines and urls.")
            source_input = gr.Textbox(label="Source", value="https://www.whitehouse.gov/")
            scrape_button = gr.Button("Scrape and Summarize")

        with gr.Column():
            result_output = gr.JSON(label="Result")
            exec_info_output = gr.Textbox(label="Execution Info")

    scrape_button.click(
        scrape_and_summarize,
        inputs=[prompt_input, source_input],
        outputs=[result_output, exec_info_output],
    )
# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()