Spaces:
Runtime error
Runtime error
priceofdev
committed on
Commit
•
392589a
1
Parent(s):
6eff298
deploy
Browse files- .gitattributes +2 -1
- .gitignore +4 -0
- AssistantService.py +22 -0
- ExcecuteFunction.py +9 -0
- README.md +6 -4
- app.py +107 -0
- chains/code_generator/base.py +19 -0
- chains/code_generator/templates.py +58 -0
- chains/output_format/base.py +19 -0
- chains/output_format/templates.py +30 -0
- config.ini.example +2 -0
- requirements.txt +74 -0
- utils/SaveFunction.py +4 -0
.gitattributes
CHANGED
@@ -25,7 +25,6 @@
|
|
25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
*.wasm filter=lfs diff=lfs merge=lfs -text
|
@@ -33,3 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
|
|
28 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
*.wasm filter=lfs diff=lfs merge=lfs -text
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
Vídeo[[:space:]]sin[[:space:]]título[[:space:]]‐[[:space:]]Hecho[[:space:]]con[[:space:]]Clipchamp[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text
|
36 |
+
demo.mp4 filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
config.ini
|
2 |
+
__pycache__/
|
3 |
+
.streamlit/
|
4 |
+
secrets.toml
|
AssistantService.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.chat_models import ChatOpenAI
|
2 |
+
from chains.output_format.base import chain_output_format
|
3 |
+
from chains.code_generator.base import chain_code_generator
|
4 |
+
import os
|
5 |
+
|
6 |
+
class GPTAssistant():
    """Thin wrapper around a ChatOpenAI model exposing the two LLM chains
    used by the app: one that proposes a JSON output format for a piece of
    HTML, and one that generates the scraping code for that format.
    """

    def __init__(self, api_key: str):
        # The langchain OpenAI client reads the key from the environment.
        os.environ['OPENAI_API_KEY'] = api_key
        self.llm = ChatOpenAI(
            temperature=0,
            model_name='gpt-3.5-turbo',
            request_timeout=120,
            client=None,
        )

    def chain_response_format(self, html_content):
        """Run the output-format chain on *html_content* and return its result."""
        fmt_chain = chain_output_format(self.llm)
        return fmt_chain.run(html_content=html_content)

    def chain_code_generator(self, output_format, html_content):
        """Generate scraping code for *output_format* / *html_content*.

        NOTE: the call below resolves to the module-level
        ``chain_code_generator`` factory imported at the top of the file,
        not to this method — method names are not in scope inside bodies.
        """
        code_chain = chain_code_generator(self.llm)
        return code_chain.run(output_format=output_format, html_content=html_content)
|
ExcecuteFunction.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import importlib

def execute_function(module_name="output", function_name="extract_info"):
    """Import (or re-import) *module_name* and return its *function_name* callable.

    The app rewrites the generated scraper to ``output.py`` on every run
    (see utils/SaveFunction.py), so after importing we also reload the
    module — ``importlib.import_module`` alone would keep serving the
    stale, cached version after the first import.

    Args:
        module_name: module to import (defaults to the generated ``output``).
        function_name: attribute to fetch from it (defaults to ``extract_info``).

    Returns:
        The requested callable.
    """
    module = importlib.import_module(module_name)
    # Pick up a freshly regenerated file instead of the cached module.
    module = importlib.reload(module)
    function = getattr(module, function_name)
    print("returning function")
    return function
|
README.md
CHANGED
@@ -1,12 +1,14 @@
|
|
1 |
---
|
2 |
-
title: GPT
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
colorTo: purple
|
6 |
sdk: streamlit
|
7 |
-
sdk_version: 1.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
|
|
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: GPT Auto Webscraping
|
3 |
+
emoji: 🍧
|
4 |
+
colorFrom: gray
|
5 |
colorTo: purple
|
6 |
sdk: streamlit
|
7 |
+
sdk_version: 1.21.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
+
license: mit
|
11 |
+
duplicated_from: CognitiveLabs/GPT-auto-webscraping
|
12 |
---
|
13 |
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from AssistantService import GPTAssistant
|
2 |
+
from openai.error import AuthenticationError
|
3 |
+
import streamlit as st
|
4 |
+
from langsmith.run_helpers import traceable
|
5 |
+
import configparser
|
6 |
+
import os
|
7 |
+
|
8 |
+
# Read the optional local config file for an OpenAI API key.
# If the key is absent, assistant_api_key is '' and the Streamlit
# secrets fallback further down is used instead.
config = configparser.ConfigParser()
config.read('config.ini')
if 'DEFAULT' in config:
    assistant_api_key = config['DEFAULT'].get('API-KEY', '')

# Enable LangSmith tracing for the langchain calls; the credentials and
# project name come from Streamlit secrets (.streamlit/secrets.toml).
os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"]=st.secrets["LANGCHAIN_API_KEY"]
os.environ["LANGCHAIN_PROJECT"]=st.secrets["LANGCHAIN_PROJECT"]
|
17 |
+
|
18 |
+
@traceable(run_type="tool")
def start_session(session_started: bool) -> bool:
    """Record the session flag in Streamlit session state.

    Wrapped with ``@traceable`` so each new session shows up as a tool
    run in LangSmith. Returns the flag unchanged.
    """
    st.session_state['session_started'] = session_started
    return session_started
|
22 |
+
|
23 |
+
# Mark the session as started on the first script run only
# (Streamlit re-executes this file on every interaction).
if 'session_started' not in st.session_state:
    start_session(True)

st.write("This app helps you to extract data from HTML code using web scraping. It uses GPT-3.5-turbo to generate the code for you. \n *Contribute to this project on [GitHub](https://github.com/princeofdev/ai-gpt-scraping)*")

with st.expander(label="Check out the video demo"):
    yt_video = st.video("https://www.youtube.com/watch?v=_zeCun4OlCc")

info_text = """
**Quick start** \n
Fill the input with <HTML code>.
- Choose a repeating element on the page, like a product on a list.
- Inspect the HTML code and copy the element.
- After generating the "output format" and the code, paste the complete HTML code of the page in the last input to test it
"""
st.write(info_text)
st.image("https://j.gifs.com/gpqvPl.gif", width=600)


# Resolve the API key: config.ini first, Streamlit secrets as fallback.
# If neither yields a key, gpt_assistant is never bound and the
# `except NameError` handlers below report "Complete the API key field".
# NOTE(review): nesting reconstructed from a whitespace-stripped scrape —
# confirm against the original file.
if assistant_api_key == '':
    assistant_api_key = st.secrets["API_KEY"]
    if assistant_api_key:
        gpt_assistant = GPTAssistant(assistant_api_key)
else:
    gpt_assistant = GPTAssistant(assistant_api_key)

# get the html content
html_content = st.text_input("Paste the HTML tags of the item you want to extract:", max_chars=10000, help="example: <li>Product 1 </li>, watch the video above")
# check if html_content is an url, and show error if it is
if html_content:
    if html_content.startswith("http"):
        st.write("Please paste the HTML piece code, not the URL")
        html_content = None

extract_button = st.button("Generate output format & code")
|
60 |
+
|
61 |
+
|
62 |
+
# Step 1/2: ask the LLM for a JSON output format describing the HTML.
# The result is stashed in session state so it survives Streamlit reruns.
if html_content and extract_button:
    try:
        st.write("1/2: Generating the output format...")
        output = gpt_assistant.chain_response_format(html_content)
        st.session_state['output_format'] = output
    except NameError:
        # gpt_assistant was never created — no API key was provided.
        st.write("Complete the API key field")
    except AuthenticationError:
        st.write("Invalid API key")

# Step 2/2: once an output format exists, display it and generate the
# scraping code for it. The "_exec" variant appends a call so that
# exec()-ing it (in test_the_code) binds a global `result` from the
# global `html_data` set in the test section below.
if 'output_format' in st.session_state:
    output_format = st.code(st.session_state['output_format'], language="json")

    try:
        st.write("2/2: Generating the code...")
        python_code = gpt_assistant.chain_code_generator(st.session_state['output_format'], html_content)
        st.session_state['code_generated'] = python_code
        st.session_state['code_generated_exec'] = python_code + "\nresult = extract_info(html_data)"

    except NameError:
        st.write("Complete the API key field")
    except AuthenticationError:
        st.write("Invalid API key")
|
85 |
+
|
86 |
+
@traceable(run_type="tool")
def test_the_code(code, full_content):
    """Execute the generated scraper against the pasted HTML and show the result.

    ``code`` is the generated function source plus a trailing
    ``result = extract_info(html_data)`` line (built in the step-2 block
    above), so exec-ing it in ``globals()`` reads the module-global
    ``html_data`` and assigns the module-global ``result``.
    ``full_content`` is not read directly here; the caller has already
    bound it to ``html_data``.

    SECURITY NOTE(review): exec() runs LLM-generated code with full
    process privileges — acceptable only because the user reviews the
    displayed code first; do not reuse this pattern on untrusted input.
    """
    exec(code, globals())
    if result:
        st.write("data extracted successfully")
        # show data in table
        st.table(result)
    else:
        st.write("error extracting data")

    return result or "error"
|
97 |
+
|
98 |
+
|
99 |
+
# Final section: show the generated function and let the user test it
# against a full HTML page.
if 'code_generated' in st.session_state:
    python_function_label = st.write("Here is your python function:")
    code_generated = st.code(st.session_state['code_generated'],language="python")
    full_content = st.text_input("Paste your complete HTML here:")
    test_code = st.button("Test the code")
    if full_content and test_code:
        # These two globals are consumed by the exec'd code inside
        # test_the_code: html_data is its input, result its output.
        html_data = full_content
        result = None
        test_the_code(st.session_state['code_generated_exec'], full_content=full_content)
|
chains/code_generator/base.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.chains import LLMChain
|
2 |
+
from langchain.memory import ConversationBufferMemory
|
3 |
+
from chains.code_generator.templates import chat_script_prompt
|
4 |
+
|
5 |
+
|
6 |
+
def chain_code_generator(llm) -> LLMChain:
    """Build the code-generation chain.

    Wires *llm* to the chat prompt from ``templates.py`` with a
    conversation buffer memory keyed on ``output_format``; the chain's
    result is exposed under the ``script`` output key.
    """
    memory = ConversationBufferMemory(input_key="output_format",
                                      memory_key="chat_history")

    chain = LLMChain(
        llm=llm,
        prompt=chat_script_prompt,
        verbose=True,
        output_key="script",
        memory=memory,
    )
    return chain
|
chains/code_generator/templates.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
    PromptTemplate,
)

# Prompt templates
# System prompt instructing the model to emit a single `extract_info`
# python function for the given HTML sample and output format.
system_template_script = PromptTemplate(
    input_variables=["output_format", "html_content"],
    template="""You are a helpful assitant that helps people create python scripts for web scraping.
--------------------------------
The example of the html content is: {html_content}
--------------------------------
You have to create a python function that extract information from an html code using web scrapping.

Try to select the deeper class that is common among the elements to make de find_all function.

Your answer SHOULD only contain the python function code without any aditional word or character.

Import the used libraries above the function definition.

The function name must be extract_info.

The function have to receive the html data as a parameter.

Your function needs to extract information for all the elements with similar attributes.

An element could have missing attributes

Before calling .text or ['href'] methods, check if the element exists.

----------------
FINAL ANSWER EXAMPLE:
from bs4 import BeautifulSoup

def extract_info(html):
    ...CODE...
    return {output_format}
----------------

Always check if the element exists before calling some method.

""",
)

human_template_script = PromptTemplate(input_variables=[], template="give me the code")

# Chat Prompt objects
# NOTE: only the raw `.template` strings are passed on here —
# `from_template` re-derives the input variables itself, so the
# `input_variables` lists above are not what the chat prompt uses.
system_template_script_prompt = SystemMessagePromptTemplate.from_template(
    system_template_script.template
)
human_template_script_prompt = HumanMessagePromptTemplate.from_template(
    human_template_script.template
)
# Final chat prompt consumed by chains/code_generator/base.py.
chat_script_prompt = ChatPromptTemplate.from_messages(
    [system_template_script_prompt, human_template_script_prompt]
)
|
chains/output_format/base.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.chains import LLMChain
|
2 |
+
from langchain.memory import ConversationBufferMemory
|
3 |
+
from chains.output_format.templates import output_format_chat_prompt
|
4 |
+
|
5 |
+
|
6 |
+
def chain_output_format(llm) -> LLMChain:
    """Build the output-format chain.

    Wires *llm* to the chat prompt from ``templates.py`` with a
    conversation buffer memory keyed on ``html_content``; the chain's
    result is exposed under the ``output_format`` output key.
    """
    memory = ConversationBufferMemory(input_key="html_content",
                                      memory_key="chat_history")

    chain = LLMChain(
        llm=llm,
        prompt=output_format_chat_prompt,
        verbose=True,
        output_key="output_format",
        memory=memory,
    )
    return chain
|
chains/output_format/templates.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, PromptTemplate

# prompt templates
# System prompt instructing the model to return only a flat JSON array
# describing the relevant information in the supplied HTML.
system_template_output_format = PromptTemplate(
    input_variables = ['html_content'],
    template='''You are a helpful assitant that helps people extract JSON information from HTML content.

The input is a HTML content.

The expected output is a JSON with a relevant information in the following html: {html_content}

Try to extract as much information as possible. Including images, links, etc.

The assitant answer should ONLY contain the JSON information without any aditional word or character.

The JSON output must have 1 depth level as much.

The expected output format is an array of objects.

''')

human_template_output_format = PromptTemplate(
    input_variables = ['html_content'],
    template='this is the html content: {html_content}'
)

# chat prompts objects
# NOTE: only the raw `.template` strings are passed on — `from_template`
# re-derives the input variables, so the `input_variables` lists above
# are not what the chat prompt uses.
system_message_prompt = SystemMessagePromptTemplate.from_template(system_template_output_format.template)
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template_output_format.template)
# Final chat prompt consumed by chains/output_format/base.py.
output_format_chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
|
config.ini.example
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
[DEFAULT]
|
2 |
+
API-KEY=OpenAI API KEY HERE
|
requirements.txt
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiohttp==3.8.4
|
2 |
+
aiosignal==1.3.1
|
3 |
+
altair==4.2.2
|
4 |
+
async-timeout==4.0.2
|
5 |
+
attrs==23.1.0
|
6 |
+
beautifulsoup4==4.12.2
|
7 |
+
blinker==1.6.2
|
8 |
+
cachetools==5.3.0
|
9 |
+
certifi==2023.5.7
|
10 |
+
charset-normalizer==3.1.0
|
11 |
+
click==8.1.3
|
12 |
+
colorama==0.4.6
|
13 |
+
dataclasses-json==0.5.7
|
14 |
+
decorator==5.1.1
|
15 |
+
entrypoints==0.4
|
16 |
+
frozenlist==1.3.3
|
17 |
+
gitdb==4.0.10
|
18 |
+
GitPython==3.1.31
|
19 |
+
greenlet==2.0.2
|
20 |
+
idna==3.4
|
21 |
+
importlib-metadata==6.6.0
|
22 |
+
Jinja2==3.1.2
|
23 |
+
jsonschema==4.17.3
|
24 |
+
langchain==0.0.234
|
25 |
+
langsmith==0.0.5
|
26 |
+
lxml==4.9.2
|
27 |
+
markdown-it-py==2.2.0
|
28 |
+
MarkupSafe==2.1.2
|
29 |
+
marshmallow==3.19.0
|
30 |
+
marshmallow-enum==1.5.1
|
31 |
+
mdurl==0.1.2
|
32 |
+
MechanicalSoup==1.2.0
|
33 |
+
multidict==6.0.4
|
34 |
+
mypy-extensions==1.0.0
|
35 |
+
numexpr==2.8.4
|
36 |
+
numpy==1.24.3
|
37 |
+
openai==0.27.6
|
38 |
+
openapi-schema-pydantic==1.2.4
|
39 |
+
packaging==23.1
|
40 |
+
pandas==2.0.1
|
41 |
+
Pillow==9.5.0
|
42 |
+
protobuf==3.20.3
|
43 |
+
pyarrow==12.0.0
|
44 |
+
pydantic==1.10.7
|
45 |
+
pydeck==0.8.1b0
|
46 |
+
Pygments==2.15.1
|
47 |
+
Pympler==1.0.1
|
48 |
+
pyrsistent==0.19.3
|
49 |
+
python-dateutil==2.8.2
|
50 |
+
pytz==2023.3
|
51 |
+
pytz-deprecation-shim==0.1.0.post0
|
52 |
+
PyYAML==6.0
|
53 |
+
requests==2.30.0
|
54 |
+
rich==13.3.5
|
55 |
+
six==1.16.0
|
56 |
+
smmap==5.0.0
|
57 |
+
soupsieve==2.4.1
|
58 |
+
SQLAlchemy==2.0.13
|
59 |
+
streamlit==1.22.0
|
60 |
+
streamlit-ace==0.1.1
|
61 |
+
tenacity==8.2.2
|
62 |
+
toml==0.10.2
|
63 |
+
toolz==0.12.0
|
64 |
+
tornado==6.3.1
|
65 |
+
tqdm==4.65.0
|
66 |
+
typing-inspect==0.8.0
|
67 |
+
typing_extensions==4.5.0
|
68 |
+
tzdata==2023.3
|
69 |
+
tzlocal==4.3
|
70 |
+
urllib3==2.0.2
|
71 |
+
validators==0.20.0
|
72 |
+
watchdog==3.0.0
|
73 |
+
yarl==1.9.2
|
74 |
+
zipp==3.15.0
|
utils/SaveFunction.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def save_function(code):
    """Persist generated scraper code to ``output.py``, overwriting it.

    Args:
        code: python source text to write.

    Uses a context manager so the file handle is closed even if the write
    raises, and writes UTF-8 explicitly so non-ASCII characters in the
    generated code don't depend on the platform's locale encoding.
    """
    with open('output.py', 'w', encoding='utf-8') as function_file:
        function_file.write(code)
|