priceofdev committed
Commit 392589a
1 Parent(s): 6eff298
.gitattributes CHANGED
@@ -25,7 +25,6 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Vídeo[[:space:]]sin[[:space:]]título[[:space:]]‐[[:space:]]Hecho[[:space:]]con[[:space:]]Clipchamp[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text
+demo.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
+config.ini
+__pycache__/
+.streamlit/
+secrets.toml
AssistantService.py ADDED
@@ -0,0 +1,22 @@
+from langchain.chat_models import ChatOpenAI
+from chains.output_format.base import chain_output_format
+from chains.code_generator.base import chain_code_generator
+import os
+
+class GPTAssistant():
+    def __init__(self, api_key: str):
+        os.environ['OPENAI_API_KEY'] = api_key
+        self.llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo', request_timeout=120, client=None)
+
+    def chain_response_format(self, html_content):
+        # build the output-format prompt chain
+        output_format_chain = chain_output_format(self.llm)
+
+        # run it on the HTML sample
+        return output_format_chain.run(html_content=html_content)
+
+    def chain_code_generator(self, output_format, html_content):
+        # build the code-generation prompt chain
+        script_chain = chain_code_generator(self.llm)
+
+        return script_chain.run(output_format=output_format, html_content=html_content)
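For orientation, the two chains are meant to be called in sequence, which is exactly what app.py later in this diff does. A minimal sketch of that flow outside Streamlit (the API key value and HTML fragment are placeholders, not part of the commit):

from AssistantService import GPTAssistant

assistant = GPTAssistant(api_key="sk-...")  # placeholder key
html_snippet = "<li class='product'><a href='/p/1'>Product 1</a></li>"  # placeholder fragment

# 1/2: propose a JSON output format for the fragment
output_format = assistant.chain_response_format(html_snippet)

# 2/2: generate the extract_info scraping function for that format
python_code = assistant.chain_code_generator(output_format, html_snippet)
print(python_code)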
ExcecuteFunction.py ADDED
@@ -0,0 +1,9 @@
+import importlib
+
+def execute_function():
+    module = "output"
+    function = "extract_info"
+    module = importlib.import_module(module)
+    function = getattr(module, function)
+    print("returning function")
+    return function
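A sketch of how this helper pairs with utils/SaveFunction.py further down in this diff: the generated code is first written to output.py, then imported and applied to the page HTML. The code string and HTML below are placeholders, and utils/ is assumed to be importable as a package:

from ExcecuteFunction import execute_function
from utils.SaveFunction import save_function

generated_code = "def extract_info(html):\n    return []"  # placeholder for the GPT-generated script
save_function(generated_code)       # writes output.py

extract_info = execute_function()   # imports output.py and returns its extract_info
print(extract_info("<html>...</html>"))  # placeholder page HTML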
README.md CHANGED
@@ -1,12 +1,14 @@
 ---
-title: GPT Scraping
-emoji: 🔥
-colorFrom: green
+title: GPT Auto Webscraping
+emoji: 🍧
+colorFrom: gray
 colorTo: purple
 sdk: streamlit
-sdk_version: 1.26.0
+sdk_version: 1.21.0
 app_file: app.py
 pinned: false
+license: mit
+duplicated_from: CognitiveLabs/GPT-auto-webscraping
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,107 @@
+from AssistantService import GPTAssistant
+from openai.error import AuthenticationError
+import streamlit as st
+from langsmith.run_helpers import traceable
+import configparser
+import os
+
+config = configparser.ConfigParser()
+config.read('config.ini')
+if 'DEFAULT' in config:
+    assistant_api_key = config['DEFAULT'].get('API-KEY', '')
+
+os.environ["LANGCHAIN_TRACING_V2"] = "true"
+os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
+os.environ["LANGCHAIN_API_KEY"] = st.secrets["LANGCHAIN_API_KEY"]
+os.environ["LANGCHAIN_PROJECT"] = st.secrets["LANGCHAIN_PROJECT"]
+
+@traceable(run_type="tool")
+def start_session(session_started):
+    st.session_state['session_started'] = session_started
+    return session_started
+
+# mark the session as started on the first run
+if 'session_started' not in st.session_state:
+    start_session(True)
+
+st.write("This app helps you to extract data from HTML code using web scraping. It uses GPT-3.5-turbo to generate the code for you. \n *Contribute to this project on [GitHub](https://github.com/princeofdev/ai-gpt-scraping)*")
+
+with st.expander(label="Check out the video demo"):
+    yt_video = st.video("https://www.youtube.com/watch?v=_zeCun4OlCc")
+
+info_text = """
+**Quick start** \n
+Fill the input with <HTML code>.
+- Choose a repeating element on the page, like a product on a list.
+- Inspect the HTML code and copy the element.
+- After generating the "output format" and the code, paste the complete HTML code of the page in the last input to test it.
+"""
+st.write(info_text)
+st.image("https://j.gifs.com/gpqvPl.gif", width=600)
+
+
+
+if assistant_api_key == '':
+    assistant_api_key = st.secrets["API_KEY"]
+    if assistant_api_key:
+        gpt_assistant = GPTAssistant(assistant_api_key)
+else:
+    gpt_assistant = GPTAssistant(assistant_api_key)
+
+# get the html content
+html_content = st.text_input("Paste the HTML tags of the item you want to extract:", max_chars=10000, help="example: <li>Product 1 </li>, watch the video above")
+# check if html_content is a URL, and show an error if it is
+if html_content:
+    if html_content.startswith("http"):
+        st.write("Please paste the HTML code snippet, not the URL")
+        html_content = None
+
+extract_button = st.button("Generate output format & code")
+
+
+if html_content and extract_button:
+    try:
+        st.write("1/2: Generating the output format...")
+        output = gpt_assistant.chain_response_format(html_content)
+        st.session_state['output_format'] = output
+    except NameError:
+        st.write("Complete the API key field")
+    except AuthenticationError:
+        st.write("Invalid API key")
+
+if 'output_format' in st.session_state:
+    output_format = st.code(st.session_state['output_format'], language="json")
+
+    try:
+        st.write("2/2: Generating the code...")
+        python_code = gpt_assistant.chain_code_generator(st.session_state['output_format'], html_content)
+        st.session_state['code_generated'] = python_code
+        st.session_state['code_generated_exec'] = python_code + "\nresult = extract_info(html_data)"
+
+    except NameError:
+        st.write("Complete the API key field")
+    except AuthenticationError:
+        st.write("Invalid API key")
+
+@traceable(run_type="tool")
+def test_the_code(code, full_content):
+    exec(code, globals())
+    if result:
+        st.write("data extracted successfully")
+        # show the extracted data in a table
+        st.table(result)
+    else:
+        st.write("error extracting data")
+
+    return result or "error"
+
+
+if 'code_generated' in st.session_state:
+    python_function_label = st.write("Here is your python function:")
+    code_generated = st.code(st.session_state['code_generated'], language="python")
+    full_content = st.text_input("Paste your complete HTML here:")
+    test_code = st.button("Test the code")
+    if full_content and test_code:
+        html_data = full_content
+        result = None
+        test_the_code(st.session_state['code_generated_exec'], full_content=full_content)
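The "Test the code" step relies on exec plus module globals: html_data and result are set as globals before the call, the generated script defines extract_info, and the appended line result = extract_info(html_data) writes the extraction back into globals. A stripped-down sketch of that mechanism, with a stand-in snippet rather than real model output:

# stand-in for st.session_state['code_generated_exec'] (illustrative, not model output)
generated = (
    "def extract_info(html):\n"
    "    return [{'length': len(html)}]\n"
    "result = extract_info(html_data)"
)

html_data = "<li>Product 1</li>"  # placeholder for the pasted page HTML
result = None

exec(generated, globals())  # defines extract_info and assigns the global 'result'
print(result)               # the list returned by the generated extract_info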
chains/code_generator/base.py ADDED
@@ -0,0 +1,19 @@
+from langchain.chains import LLMChain
+from langchain.memory import ConversationBufferMemory
+from chains.code_generator.templates import chat_script_prompt
+
+
+def chain_code_generator(llm) -> LLMChain:
+    # Memory
+    script_memory = ConversationBufferMemory(
+        input_key="output_format", memory_key="chat_history"
+    )
+
+    # Chain
+    return LLMChain(
+        llm=llm,
+        prompt=chat_script_prompt,
+        verbose=True,
+        output_key="script",
+        memory=script_memory,
+    )
chains/code_generator/templates.py ADDED
@@ -0,0 +1,58 @@
+from langchain.prompts import (
+    SystemMessagePromptTemplate,
+    HumanMessagePromptTemplate,
+    ChatPromptTemplate,
+    PromptTemplate,
+)
+
+# Prompt templates
+system_template_script = PromptTemplate(
+    input_variables=["output_format", "html_content"],
+    template="""You are a helpful assistant that helps people create python scripts for web scraping.
+    --------------------------------
+    The example of the html content is: {html_content}
+    --------------------------------
+    You have to create a python function that extracts information from an html code using web scraping.
+
+    Try to select the deepest class that is common among the elements to make the find_all function.
+
+    Your answer SHOULD only contain the python function code without any additional word or character.
+
+    Import the used libraries above the function definition.
+
+    The function name must be extract_info.
+
+    The function has to receive the html data as a parameter.
+
+    Your function needs to extract information for all the elements with similar attributes.
+
+    An element could have missing attributes.
+
+    Before calling .text or ['href'] methods, check if the element exists.
+
+    ----------------
+    FINAL ANSWER EXAMPLE:
+    from bs4 import BeautifulSoup
+
+    def extract_info(html):
+        ...CODE...
+        return {output_format}
+    ----------------
+
+    Always check if the element exists before calling some method.
+
+    """,
+)
+
+human_template_script = PromptTemplate(input_variables=[], template="give me the code")
+
+# Chat Prompt objects
+system_template_script_prompt = SystemMessagePromptTemplate.from_template(
+    system_template_script.template
+)
+human_template_script_prompt = HumanMessagePromptTemplate.from_template(
+    human_template_script.template
+)
+chat_script_prompt = ChatPromptTemplate.from_messages(
+    [system_template_script_prompt, human_template_script_prompt]
+)
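For reference, the kind of function this prompt asks the model to return might look like the sketch below. The div class name and the fields are hypothetical, chosen only to show the find_all-on-a-common-class pattern and the existence checks the template insists on:

from bs4 import BeautifulSoup

def extract_info(html):
    soup = BeautifulSoup(html, "html.parser")
    # hypothetical common class shared by the repeating elements
    items = soup.find_all("div", class_="product-item")
    results = []
    for item in items:
        title = item.find("h2")
        link = item.find("a")
        image = item.find("img")
        # check that each sub-element exists before calling .text or ['href']
        results.append({
            "title": title.text.strip() if title else None,
            "link": link["href"] if link and link.has_attr("href") else None,
            "image": image["src"] if image and image.has_attr("src") else None,
        })
    return results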
chains/output_format/base.py ADDED
@@ -0,0 +1,19 @@
+from langchain.chains import LLMChain
+from langchain.memory import ConversationBufferMemory
+from chains.output_format.templates import output_format_chat_prompt
+
+
+def chain_output_format(llm) -> LLMChain:
+    # memory
+    html_memory = ConversationBufferMemory(
+        input_key="html_content", memory_key="chat_history"
+    )
+
+    # chain
+    return LLMChain(
+        llm=llm,
+        prompt=output_format_chat_prompt,
+        verbose=True,
+        output_key="output_format",
+        memory=html_memory,
+    )
chains/output_format/templates.py ADDED
@@ -0,0 +1,30 @@
+from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, PromptTemplate
+
+# prompt templates
+system_template_output_format = PromptTemplate(
+    input_variables=['html_content'],
+    template='''You are a helpful assistant that helps people extract JSON information from HTML content.
+
+    The input is HTML content.
+
+    The expected output is a JSON with the relevant information in the following html: {html_content}
+
+    Try to extract as much information as possible. Including images, links, etc.
+
+    The assistant answer should ONLY contain the JSON information without any additional word or character.
+
+    The JSON output must have at most 1 depth level.
+
+    The expected output format is an array of objects.
+
+    ''')
+
+human_template_output_format = PromptTemplate(
+    input_variables=['html_content'],
+    template='this is the html content: {html_content}'
+)
+
+# chat prompts objects
+system_message_prompt = SystemMessagePromptTemplate.from_template(system_template_output_format.template)
+human_message_prompt = HumanMessagePromptTemplate.from_template(human_template_output_format.template)
+output_format_chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
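To make the contract concrete, the "output format" this chain is expected to return is a flat array of objects, one level deep, for example (hypothetical product-list fields, not output of this commit):

[
  {"title": "Product 1", "price": "$10.99", "link": "https://example.com/p/1", "image": "https://example.com/img/1.jpg"},
  {"title": "Product 2", "price": "$5.49", "link": "https://example.com/p/2", "image": "https://example.com/img/2.jpg"}
]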
config.ini.example ADDED
@@ -0,0 +1,2 @@
+[DEFAULT]
+API-KEY=OpenAI API KEY HERE
requirements.txt ADDED
@@ -0,0 +1,74 @@
+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==4.2.2
+async-timeout==4.0.2
+attrs==23.1.0
+beautifulsoup4==4.12.2
+blinker==1.6.2
+cachetools==5.3.0
+certifi==2023.5.7
+charset-normalizer==3.1.0
+click==8.1.3
+colorama==0.4.6
+dataclasses-json==0.5.7
+decorator==5.1.1
+entrypoints==0.4
+frozenlist==1.3.3
+gitdb==4.0.10
+GitPython==3.1.31
+greenlet==2.0.2
+idna==3.4
+importlib-metadata==6.6.0
+Jinja2==3.1.2
+jsonschema==4.17.3
+langchain==0.0.234
+langsmith==0.0.5
+lxml==4.9.2
+markdown-it-py==2.2.0
+MarkupSafe==2.1.2
+marshmallow==3.19.0
+marshmallow-enum==1.5.1
+mdurl==0.1.2
+MechanicalSoup==1.2.0
+multidict==6.0.4
+mypy-extensions==1.0.0
+numexpr==2.8.4
+numpy==1.24.3
+openai==0.27.6
+openapi-schema-pydantic==1.2.4
+packaging==23.1
+pandas==2.0.1
+Pillow==9.5.0
+protobuf==3.20.3
+pyarrow==12.0.0
+pydantic==1.10.7
+pydeck==0.8.1b0
+Pygments==2.15.1
+Pympler==1.0.1
+pyrsistent==0.19.3
+python-dateutil==2.8.2
+pytz==2023.3
+pytz-deprecation-shim==0.1.0.post0
+PyYAML==6.0
+requests==2.30.0
+rich==13.3.5
+six==1.16.0
+smmap==5.0.0
+soupsieve==2.4.1
+SQLAlchemy==2.0.13
+streamlit==1.22.0
+streamlit-ace==0.1.1
+tenacity==8.2.2
+toml==0.10.2
+toolz==0.12.0
+tornado==6.3.1
+tqdm==4.65.0
+typing-inspect==0.8.0
+typing_extensions==4.5.0
+tzdata==2023.3
+tzlocal==4.3
+urllib3==2.0.2
+validators==0.20.0
+watchdog==3.0.0
+yarl==1.9.2
+zipp==3.15.0
utils/SaveFunction.py ADDED
@@ -0,0 +1,4 @@
+def save_function(code):
+    function = open('output.py', 'w')
+    function.write(code)
+    function.close()