IamVicky111 committed on
Commit
78d79d6
1 Parent(s): bb8ca0c

Upload 3 files

Files changed (3)
  1. app.py +85 -0
  2. packages.txt +8 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,85 @@
+ import os
+ from dotenv import load_dotenv
+ from scrapegraphai.graphs import SmartScraperGraph
+ from scrapegraphai.utils import prettify_exec_info
+ from langchain_community.llms import HuggingFaceEndpoint
+ from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+ import gradio as gr
+ import subprocess
+ import json
+
+ # Ensure Playwright installs the required browsers and dependencies
+ subprocess.run(["playwright", "install"])
+ # subprocess.run(["playwright", "install-deps"])
+
+ # Load environment variables
+ load_dotenv()
+ HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+
+ # Initialize the model instances
+ repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
+ llm_model_instance = HuggingFaceEndpoint(
+     repo_id=repo_id, max_length=128, temperature=0.3, token=HUGGINGFACEHUB_API_TOKEN
+ )
+
+ embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
+     api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
+ )
+
+ graph_config = {
+     "llm": {"model_instance": llm_model_instance},
+     "embeddings": {"model_instance": embedder_model_instance}
+ }
+
+ def scrape_and_summarize(prompt, source):
+     smart_scraper_graph = SmartScraperGraph(
+         prompt=prompt,
+         source=source,
+         config=graph_config
+     )
+     result = smart_scraper_graph.run()
+
+     # Ensure the result is properly formatted as JSON
+     if isinstance(result, dict):
+         result_json = result
+     else:
+         try:
+             result_json = json.loads(result)
+         except json.JSONDecodeError as e:
+             # Attempt to extract a JSON array embedded in the raw output
+             start_index = result.find("[")
+             end_index = result.rfind("]")
+             if start_index != -1 and end_index != -1:
+                 json_str = result[start_index:end_index+1]
+                 try:
+                     result_json = json.loads(json_str)
+                 except json.JSONDecodeError as inner_e:
+                     raise ValueError(f"Invalid JSON output: {result}") from inner_e
+             else:
+                 raise ValueError(f"Invalid JSON output: {result}") from e
+
+     return result_json
+
+ # Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("<h1>Website Scraper using Mistral AI</h1>")
+     gr.Markdown("""This is a no-code ML app for scraping.<br>1. Provide the prompt, i.e., the items you want to scrape from the website.<br>2. Provide the URL of the site you want to scrape and click Generate.<br>And BOOM 💥 you can copy the result from the panel on the right.""")
+
+     with gr.Row():
+         with gr.Column():
+             prompt_input = gr.Textbox(label="Prompt", value="List all the hospital or clinic names and their opening and closing times; if a mobile number is present, provide it too.")
+             source_input = gr.Textbox(label="Source URL", value="https://www.yelp.com/biz/all-smiles-dental-san-francisco-5?osq=dentist")
+             scrape_button = gr.Button("Generate")
+
+         with gr.Column():
+             result_output = gr.JSON(label="Result")
+
+     scrape_button.click(
+         scrape_and_summarize,
+         inputs=[prompt_input, source_input],
+         outputs=[result_output]
+     )
+
+ # Launch the Gradio app
+ if __name__ == "__main__":
+     demo.launch()
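
For quick local testing outside the Gradio UI, a minimal sketch of calling scrape_and_summarize directly; the import path and example arguments are illustrative assumptions and are not part of this commit:

# Hypothetical local smoke test (not part of this commit).
# Assumes HUGGINGFACEHUB_API_TOKEN is set and that requirements.txt plus the
# Playwright browsers are installed; importing app runs its module-level setup.
from app import scrape_and_summarize

data = scrape_and_summarize(
    prompt="List all the hospital or clinic names and their opening and closing times.",
    source="https://www.yelp.com/biz/all-smiles-dental-san-francisco-5?osq=dentist",
)
print(data)  # parsed dict/list returned by the SmartScraperGraph run
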
packages.txt ADDED
@@ -0,0 +1,8 @@
+ libnss3
+ libnspr4
+ libatk1.0-0
+ libatk-bridge2.0-0
+ libcups2
+ libatspi2.0-0
+ libxcomposite1
+ libxdamage1
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio==4.31.3
+ langchain_community==0.0.38
+ python-dotenv==1.0.1
+ scrapegraphai==1.2.3
+ playwright==1.43.0