"""Gradio front-end that scrapes a web page with ScrapeGraphAI and a Mistral model.

The module builds a Hugging Face hosted LLM and an embedding model, wires
them into a ``SmartScraperGraph`` configuration, and exposes a two-field form
(prompt + source URL) whose result is shown in a JSON viewer.
"""

import json
import os
import subprocess

import gradio as gr
from dotenv import load_dotenv
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

# Ensure Playwright installs required browsers and dependencies.
# This runs at import time; the exit status is not checked.
subprocess.run(["playwright", "install"], check=False)
# subprocess.run(["playwright", "install-deps"])

# Load environment variables (a local .env file is honoured if present).
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

# Initialize the model instances.
# NOTE(review): `max_length` and `token` are passed through unchanged from the
# original script; confirm they are the keyword names HuggingFaceEndpoint
# expects in the installed langchain_community version.
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

llm_model_instance = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_length=128,
    temperature=0.3,
    token=HUGGINGFACEHUB_API_TOKEN,
)

embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN,
    model_name="sentence-transformers/all-MiniLM-l6-v2",
)

graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance},
}

# Delimiter pairs that can enclose a JSON document embedded in free text.
_JSON_DELIMITERS = (("[", "]"), ("{", "}"))

# UI copy, kept verbatim from the original script.
_APP_TITLE = "\n\nWebsites Scraper using Mistral AI\n\n"
_APP_DESCRIPTION = (
    "This is a no code ML app for scraping\n"
    "1. Just provide the Prompt, ie., the items you wanna Scrap from the website\n"
    "2. Provide the url for the site you wanna Scrap, click Generate\n"
    "And BOOM 💥 you can copy the result and view the execution details"
    " in the right side pannel "
)


def _embedded_json_candidates(text):
    """Return substrings of *text* that may hold a JSON array or object.

    For each delimiter pair the span runs from the first opener to the last
    closer. Spans are ordered by where they start, so the outermost structure
    is tried first.
    """
    spans = []
    for opener, closer in _JSON_DELIMITERS:
        start = text.find(opener)
        end = text.rfind(closer)
        # Require the closer to come after the opener; otherwise the slice
        # would be empty and could never parse.
        if start != -1 and end > start:
            spans.append((start, text[start:end + 1]))
    return [candidate for _, candidate in sorted(spans)]


def _coerce_to_json(result):
    """Convert a scraper result into a JSON-compatible Python value.

    Args:
        result: A ``dict`` or ``list`` (returned unchanged), or JSON text,
            optionally surrounded by non-JSON prose.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If *result* is not text, or no JSON can be decoded from it.
    """
    if isinstance(result, (dict, list)):
        return result

    try:
        return json.loads(result)
    except TypeError as exc:
        # json.loads only accepts str / bytes / bytearray.
        raise ValueError(f"Invalid JSON output: {result}") from exc
    except json.JSONDecodeError as exc:
        if not isinstance(result, str):
            raise ValueError(f"Invalid JSON output: {result}") from exc
        last_error = exc

    # The text as a whole is not JSON; attempt to extract JSON from it.
    for candidate in _embedded_json_candidates(result):
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as exc:
            last_error = exc

    raise ValueError(f"Invalid JSON output: {result}") from last_error


def scrape_and_summarize(prompt, source):
    """Run a ``SmartScraperGraph`` for *prompt* against *source*.

    Args:
        prompt: Natural-language description of what to extract.
        source: The source passed to the graph (the UI supplies a URL).

    Returns:
        The graph's result as a JSON-compatible value.

    Raises:
        ValueError: If the graph's output cannot be interpreted as JSON.
    """
    smart_scraper_graph = SmartScraperGraph(
        prompt=prompt,
        source=source,
        config=graph_config,
    )
    result = smart_scraper_graph.run()

    # Ensure the result is properly formatted as JSON.
    return _coerce_to_json(result)


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(_APP_TITLE)
    gr.Markdown(_APP_DESCRIPTION)

    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Prompt",
                value="List me all the hospital or clinic names and their opening closing time, if the mobile number is present provide it too.",
            )
            source_input = gr.Textbox(
                label="Source URL",
                value="https://www.yelp.com/biz/all-smiles-dental-san-francisco-5?osq=dentist",
            )
            scrape_button = gr.Button("Generate")

        with gr.Column():
            result_output = gr.JSON(label="Result")

    scrape_button.click(
        scrape_and_summarize,
        inputs=[prompt_input, source_input],
        outputs=[result_output],
    )

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()