import html
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
import gradio as gr
import subprocess

# Ensure Playwright has the required browser binaries installed
subprocess.run(["playwright", "install"])
# subprocess.run(["playwright", "install-deps"])
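# Note: "playwright install-deps" installs OS-level libraries (and typically needs root);
# it is left disabled here on the assumption that the runtime image already provides them.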

# Load environment variables
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
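# The same token is used for both the text-generation endpoint and the embeddings API below.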

# Initialize the model instances
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
llm_model_instance = HuggingFaceEndpoint(
    repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
)

embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
)

graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance},
    "headless": False
}
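# The "model_instance" keys hand the pre-built LangChain models to ScrapeGraphAI;
# "headless": False keeps the Playwright browser visible, which may need to be
# switched to True when running in a display-less environment such as a hosted Space.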


def scrape_and_summarize(prompt, source):
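    """Run a SmartScraperGraph over the given source (URL or HTML, unescaped first)
    with the user's prompt, and return the extracted result together with a
    prettified execution summary."""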
    smart_scraper_graph = SmartScraperGraph(
        prompt=prompt,
        source=html.unescape(source),
        config=graph_config
    )
    result = smart_scraper_graph.run()
    exec_info = smart_scraper_graph.get_execution_info()
    return {"result": result}, prettify_exec_info(exec_info)


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Scrape websites, no-code version")
    gr.Markdown("""Easily scrape and summarize web content using advanced AI models on the Hugging Face Hub without writing any code. Input your desired prompt and source URL to get started.
                This is a no-code version of the excellent lib [ScrapeGraphAI](https://github.com/VinciGit00/Scrapegraph-ai).
                It's a basic demo and a work in progress. Please contribute to it to make it more useful!""")

    with gr.Row():
        with gr.Column():
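            # The model field below is informational only; scrape_and_summarize
            # currently uses the fixed repo_id defined above rather than this value.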
            model_input = gr.Textbox(label="Model", value="Mistral-7B-Instruct-v0.2")
            prompt_input = gr.Textbox(label="Prompt", value="List me all the press releases with their headlines and urls.")
            source_input = gr.Textbox(label="Source", value="https://www.whitehouse.gov/")
            scrape_button = gr.Button("Scrape and Summarize")

        with gr.Column():
            result_output = gr.JSON(label="Result")
            exec_info_output = gr.Textbox(label="Execution Info")

    scrape_button.click(
        scrape_and_summarize,
        inputs=[prompt_input, source_input],
        outputs=[result_output, exec_info_output]
    )

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()