priceofdev committed
Commit 392589a
1 Parent(s): 6eff298
.gitattributes CHANGED
@@ -25,7 +25,6 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Vídeo[[:space:]]sin[[:space:]]título[[:space:]]‐[[:space:]]Hecho[[:space:]]con[[:space:]]Clipchamp[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text
+demo.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
+config.ini
+__pycache__/
+.streamlit/
+secrets.toml
AssistantService.py ADDED
@@ -0,0 +1,22 @@
+from langchain.chat_models import ChatOpenAI
+from chains.output_format.base import chain_output_format
+from chains.code_generator.base import chain_code_generator
+import os
+
+class GPTAssistant():
+    def __init__(self, api_key: str):
+        os.environ['OPENAI_API_KEY'] = api_key
+        self.llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo', request_timeout=120, client=None)
+
+    def chain_response_format(self, html_content):
+        # build the output-format prompt chain
+        output_format_chain = chain_output_format(self.llm)
+
+        # run it on the HTML sample
+        return output_format_chain.run(html_content=html_content)
+
+    def chain_code_generator(self, output_format, html_content):
+        # build the code-generation prompt chain
+        script_chain = chain_code_generator(self.llm)
+
+        return script_chain.run(output_format=output_format, html_content=html_content)
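For orientation, the two chains are meant to be called in sequence, which is exactly what app.py later in this diff does. A minimal sketch of that flow outside Streamlit (the API key value and HTML fragment are placeholders, not part of the commit):

from AssistantService import GPTAssistant

assistant = GPTAssistant(api_key="sk-...")  # placeholder key
html_snippet = "<li class='product'><a href='/p/1'>Product 1</a></li>"  # placeholder fragment

# 1/2: propose a JSON output format for the fragment
output_format = assistant.chain_response_format(html_snippet)

# 2/2: generate the extract_info scraping function for that format
python_code = assistant.chain_code_generator(output_format, html_snippet)
print(python_code)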
ExcecuteFunction.py ADDED
@@ -0,0 +1,9 @@
+import importlib
+
+def execute_function():
+    module = "output"
+    function = "extract_info"
+    module = importlib.import_module(module)
+    function = getattr(module, function)
+    print("returning function")
+    return function
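A sketch of how this helper pairs with utils/SaveFunction.py further down in this diff: the generated code is first written to output.py, then imported and applied to the page HTML. The code string and HTML below are placeholders, and utils/ is assumed to be importable as a package:

from ExcecuteFunction import execute_function
from utils.SaveFunction import save_function

generated_code = "def extract_info(html):\n    return []"  # placeholder for the GPT-generated script
save_function(generated_code)       # writes output.py

extract_info = execute_function()   # imports output.py and returns its extract_info
print(extract_info("<html>...</html>"))  # placeholder page HTML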
README.md CHANGED
@@ -1,12 +1,14 @@
 ---
-title: GPT Scraping
-emoji: 🔥
-colorFrom: green
+title: GPT Auto Webscraping
+emoji: 🍧
+colorFrom: gray
 colorTo: purple
 sdk: streamlit
-sdk_version: 1.26.0
+sdk_version: 1.21.0
 app_file: app.py
 pinned: false
+license: mit
+duplicated_from: CognitiveLabs/GPT-auto-webscraping
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,107 @@
+from AssistantService import GPTAssistant
+from openai.error import AuthenticationError
+import streamlit as st
+from langsmith.run_helpers import traceable
+import configparser
+import os
+
+config = configparser.ConfigParser()
+config.read('config.ini')
+if 'DEFAULT' in config:
+    assistant_api_key = config['DEFAULT'].get('API-KEY', '')
+
+os.environ["LANGCHAIN_TRACING_V2"] = "true"
+os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
+os.environ["LANGCHAIN_API_KEY"] = st.secrets["LANGCHAIN_API_KEY"]
+os.environ["LANGCHAIN_PROJECT"] = st.secrets["LANGCHAIN_PROJECT"]
+
+@traceable(run_type="tool")
+def start_session(session_started):
+    st.session_state['session_started'] = session_started
+    return session_started
+
+# mark the session as started on the first run
+if 'session_started' not in st.session_state:
+    start_session(True)
+
+st.write("This app helps you to extract data from HTML code using web scraping. It uses GPT-3.5-turbo to generate the code for you. \n *Contribute to this project on [GitHub](https://github.com/princeofdev/ai-gpt-scraping)*")
+
+with st.expander(label="Check out the video demo"):
+    yt_video = st.video("https://www.youtube.com/watch?v=_zeCun4OlCc")
+
+info_text = """
+**Quick start** \n
+Fill the input with <HTML code>.
+- Choose a repeating element on the page, like a product on a list.
+- Inspect the HTML code and copy the element.
+- After generating the "output format" and the code, paste the complete HTML code of the page in the last input to test it.
+"""
+st.write(info_text)
+st.image("https://j.gifs.com/gpqvPl.gif", width=600)
+
+
+
+if assistant_api_key == '':
+    assistant_api_key = st.secrets["API_KEY"]
+    if assistant_api_key:
+        gpt_assistant = GPTAssistant(assistant_api_key)
+else:
+    gpt_assistant = GPTAssistant(assistant_api_key)
+
+# get the html content
+html_content = st.text_input("Paste the HTML tags of the item you want to extract:", max_chars=10000, help="example: <li>Product 1 </li>, watch the video above")
+# check if html_content is a URL, and show an error if it is
+if html_content:
+    if html_content.startswith("http"):
+        st.write("Please paste the HTML code snippet, not the URL")
+        html_content = None
+
+extract_button = st.button("Generate output format & code")
+
+
+if html_content and extract_button:
+    try:
+        st.write("1/2: Generating the output format...")
+        output = gpt_assistant.chain_response_format(html_content)
+        st.session_state['output_format'] = output
+    except NameError:
+        st.write("Complete the API key field")
+    except AuthenticationError:
+        st.write("Invalid API key")
+
+if 'output_format' in st.session_state:
+    output_format = st.code(st.session_state['output_format'], language="json")
+
+    try:
+        st.write("2/2: Generating the code...")
+        python_code = gpt_assistant.chain_code_generator(st.session_state['output_format'], html_content)
+        st.session_state['code_generated'] = python_code
+        st.session_state['code_generated_exec'] = python_code + "\nresult = extract_info(html_data)"
+
+    except NameError:
+        st.write("Complete the API key field")
+    except AuthenticationError:
+        st.write("Invalid API key")
+
+@traceable(run_type="tool")
+def test_the_code(code, full_content):
+    exec(code, globals())
+    if result:
+        st.write("data extracted successfully")
+        # show the extracted data in a table
+        st.table(result)
+    else:
+        st.write("error extracting data")
+
+    return result or "error"
+
+
+if 'code_generated' in st.session_state:
+    python_function_label = st.write("Here is your python function:")
+    code_generated = st.code(st.session_state['code_generated'], language="python")
+    full_content = st.text_input("Paste your complete HTML here:")
+    test_code = st.button("Test the code")
+    if full_content and test_code:
+        html_data = full_content
+        result = None
+        test_the_code(st.session_state['code_generated_exec'], full_content=full_content)
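The "Test the code" step relies on exec plus module globals: html_data and result are set as globals before the call, the generated script defines extract_info, and the appended line result = extract_info(html_data) writes the extraction back into globals. A stripped-down sketch of that mechanism, with a stand-in snippet rather than real model output:

# stand-in for st.session_state['code_generated_exec'] (illustrative, not model output)
generated = (
    "def extract_info(html):\n"
    "    return [{'length': len(html)}]\n"
    "result = extract_info(html_data)"
)

html_data = "<li>Product 1</li>"  # placeholder for the pasted page HTML
result = None

exec(generated, globals())  # defines extract_info and assigns the global 'result'
print(result)               # the list returned by the generated extract_info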
chains/code_generator/base.py ADDED
@@ -0,0 +1,19 @@
+from langchain.chains import LLMChain
+from langchain.memory import ConversationBufferMemory
+from chains.code_generator.templates import chat_script_prompt
+
+
+def chain_code_generator(llm) -> LLMChain:
+    # Memory
+    script_memory = ConversationBufferMemory(
+        input_key="output_format", memory_key="chat_history"
+    )
+
+    # Chain
+    return LLMChain(
+        llm=llm,
+        prompt=chat_script_prompt,
+        verbose=True,
+        output_key="script",
+        memory=script_memory,
+    )
chains/code_generator/templates.py ADDED
@@ -0,0 +1,58 @@
+from langchain.prompts import (
+    SystemMessagePromptTemplate,
+    HumanMessagePromptTemplate,
+    ChatPromptTemplate,
+    PromptTemplate,
+)
+
+# Prompt templates
+system_template_script = PromptTemplate(
+    input_variables=["output_format", "html_content"],
+    template="""You are a helpful assistant that helps people create python scripts for web scraping.
+    --------------------------------
+    The example of the html content is: {html_content}
+    --------------------------------
+    You have to create a python function that extracts information from an html code using web scraping.
+
+    Try to select the deepest class that is common among the elements to make the find_all function.
+
+    Your answer SHOULD only contain the python function code without any additional word or character.
+
+    Import the used libraries above the function definition.
+
+    The function name must be extract_info.
+
+    The function has to receive the html data as a parameter.
+
+    Your function needs to extract information for all the elements with similar attributes.
+
+    An element could have missing attributes.
+
+    Before calling .text or ['href'] methods, check if the element exists.
+
+    ----------------
+    FINAL ANSWER EXAMPLE:
+    from bs4 import BeautifulSoup
+
+    def extract_info(html):
+        ...CODE...
+        return {output_format}
+    ----------------
+
+    Always check if the element exists before calling some method.
+
+    """,
+)
+
+human_template_script = PromptTemplate(input_variables=[], template="give me the code")
+
+# Chat Prompt objects
+system_template_script_prompt = SystemMessagePromptTemplate.from_template(
+    system_template_script.template
+)
+human_template_script_prompt = HumanMessagePromptTemplate.from_template(
+    human_template_script.template
+)
+chat_script_prompt = ChatPromptTemplate.from_messages(
+    [system_template_script_prompt, human_template_script_prompt]
+)
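For reference, the kind of function this prompt asks the model to return might look like the sketch below. The div class name and the fields are hypothetical, chosen only to show the find_all-on-a-common-class pattern and the existence checks the template insists on:

from bs4 import BeautifulSoup

def extract_info(html):
    soup = BeautifulSoup(html, "html.parser")
    # hypothetical common class shared by the repeating elements
    items = soup.find_all("div", class_="product-item")
    results = []
    for item in items:
        title = item.find("h2")
        link = item.find("a")
        image = item.find("img")
        # check that each sub-element exists before calling .text or ['href']
        results.append({
            "title": title.text.strip() if title else None,
            "link": link["href"] if link and link.has_attr("href") else None,
            "image": image["src"] if image and image.has_attr("src") else None,
        })
    return results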
chains/output_format/base.py ADDED
@@ -0,0 +1,19 @@
+from langchain.chains import LLMChain
+from langchain.memory import ConversationBufferMemory
+from chains.output_format.templates import output_format_chat_prompt
+
+
+def chain_output_format(llm) -> LLMChain:
+    # memory
+    html_memory = ConversationBufferMemory(
+        input_key="html_content", memory_key="chat_history"
+    )
+
+    # chain
+    return LLMChain(
+        llm=llm,
+        prompt=output_format_chat_prompt,
+        verbose=True,
+        output_key="output_format",
+        memory=html_memory,
+    )
chains/output_format/templates.py ADDED
@@ -0,0 +1,30 @@
+from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, PromptTemplate
+
+# prompt templates
+system_template_output_format = PromptTemplate(
+    input_variables=['html_content'],
+    template='''You are a helpful assistant that helps people extract JSON information from HTML content.
+
+    The input is HTML content.
+
+    The expected output is a JSON with the relevant information in the following html: {html_content}
+
+    Try to extract as much information as possible. Including images, links, etc.
+
+    The assistant answer should ONLY contain the JSON information without any additional word or character.
+
+    The JSON output must have at most 1 depth level.
+
+    The expected output format is an array of objects.
+
+    ''')
+
+human_template_output_format = PromptTemplate(
+    input_variables=['html_content'],
+    template='this is the html content: {html_content}'
+)
+
+# chat prompts objects
+system_message_prompt = SystemMessagePromptTemplate.from_template(system_template_output_format.template)
+human_message_prompt = HumanMessagePromptTemplate.from_template(human_template_output_format.template)
+output_format_chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
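To make the contract concrete, the "output format" this chain is expected to return is a flat array of objects, one level deep, for example (hypothetical product-list fields, not output of this commit):

[
  {"title": "Product 1", "price": "$10.99", "link": "https://example.com/p/1", "image": "https://example.com/img/1.jpg"},
  {"title": "Product 2", "price": "$5.49", "link": "https://example.com/p/2", "image": "https://example.com/img/2.jpg"}
]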
config.ini.example ADDED
@@ -0,0 +1,2 @@
+[DEFAULT]
+API-KEY=OpenAI API KEY HERE
requirements.txt ADDED
@@ -0,0 +1,74 @@
+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==4.2.2
+async-timeout==4.0.2
+attrs==23.1.0
+beautifulsoup4==4.12.2
+blinker==1.6.2
+cachetools==5.3.0
+certifi==2023.5.7
+charset-normalizer==3.1.0
+click==8.1.3
+colorama==0.4.6
+dataclasses-json==0.5.7
+decorator==5.1.1
+entrypoints==0.4
+frozenlist==1.3.3
+gitdb==4.0.10
+GitPython==3.1.31
+greenlet==2.0.2
+idna==3.4
+importlib-metadata==6.6.0
+Jinja2==3.1.2
+jsonschema==4.17.3
+langchain==0.0.234
+langsmith==0.0.5
+lxml==4.9.2
+markdown-it-py==2.2.0
+MarkupSafe==2.1.2
+marshmallow==3.19.0
+marshmallow-enum==1.5.1
+mdurl==0.1.2
+MechanicalSoup==1.2.0
+multidict==6.0.4
+mypy-extensions==1.0.0
+numexpr==2.8.4
+numpy==1.24.3
+openai==0.27.6
+openapi-schema-pydantic==1.2.4
+packaging==23.1
+pandas==2.0.1
+Pillow==9.5.0
+protobuf==3.20.3
+pyarrow==12.0.0
+pydantic==1.10.7
+pydeck==0.8.1b0
+Pygments==2.15.1
+Pympler==1.0.1
+pyrsistent==0.19.3
+python-dateutil==2.8.2
+pytz==2023.3
+pytz-deprecation-shim==0.1.0.post0
+PyYAML==6.0
+requests==2.30.0
+rich==13.3.5
+six==1.16.0
+smmap==5.0.0
+soupsieve==2.4.1
+SQLAlchemy==2.0.13
+streamlit==1.22.0
+streamlit-ace==0.1.1
+tenacity==8.2.2
+toml==0.10.2
+toolz==0.12.0
+tornado==6.3.1
+tqdm==4.65.0
+typing-inspect==0.8.0
+typing_extensions==4.5.0
+tzdata==2023.3
+tzlocal==4.3
+urllib3==2.0.2
+validators==0.20.0
+watchdog==3.0.0
+yarl==1.9.2
+zipp==3.15.0
utils/SaveFunction.py ADDED
@@ -0,0 +1,4 @@
+def save_function(code):
+    function = open('output.py', 'w')
+    function.write(code)
+    function.close()