Spaces:

Prathmesh48
/

Extract-Fields

Sleeping

App Files Files Community

Prathmesh48 committed on Jun 5, 2024

Commit

1ff214d

verified ·

1 Parent(s): 433d541

Create app.py

Browse files

Files changed (1) hide show

app.py +104 -0

app.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import concurrent.futures
+import os
+import random
+import streamlit as st
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key='AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA', temperature=0.1)
+gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key='AIzaSyABsaDjPujPCBlz4LLxcXDX_bDA9uEL7Xc', temperature=0.1)
+def pdf_extractor(link):
+    text = ''
+    loader = PyPDFLoader(link)
+    pages = loader.load_and_split()
+    for page in pages:
+        text += page.page_content
+    return [text]
+def web_extractor(link):
+    text = ''
+    loader = WebBaseLoader(link)
+    pages = loader.load_and_split()
+    for page in pages:
+        text += page.page_content
+    return [text]
+def feature_extraction(tag, history, context):
+    prompt = f'''
+    You are an intelligent assistant tasked with updating product information. You have two data sources:
+    1. Tag_History: Previously gathered information about the product.
+    2. Tag_Context: New data that might contain additional details.
+    Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
+    Guidelines:
+    - Only add new details that are relevant to the {tag} FIELD.
+    - Do not add or modify any other fields in the Tag_History.
+    - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
+    Here is the data:
+    Tag_Context: {str(context)}
+    Tag_History: {history}
+    Respond with the updated Tag_History.
+    '''
+    model = random.choice([gemini, gemini1])
+    result = model.invoke(prompt)
+    return result.content
+def main(link):
+    history = {
+        "Introduction": "",
+        "Specifications": "",
+        "Product Overview": "",
+        "Safety Information": "",
+        "Installation Instructions": "",
+        "Setup and Configuration": "",
+        "Operation Instructions": "",
+        "Maintenance and Care": "",
+        "Troubleshooting": "",
+        "Warranty Information": "",
+        "Legal Information": ""
+    }
+    # Extract Text
+    if link.endswith('.md') or link[8:11] == 'en.':
+        text = web_extractor(link)
+    else:
+        text = pdf_extractor(link)
+    # Create Chunks
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=10000,
+        chunk_overlap=100,
+        separators=["", '', " "]
+    )
+    chunks = text_splitter.create_documents(text)
+    for idx, chunk in enumerate(chunks):
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            future_to_key = {
+                executor.submit(feature_extraction, key, history[key], chunk.page_content): key for key in history
+            }
+            for future in concurrent.futures.as_completed(future_to_key):
+                key = future_to_key[future]
+                try:
+                    response = future.result()
+                    history[key] = response
+                    st.write(f"Intermediate result for {key}: {response}")
+                except Exception as e:
+                    st.write(f"Error processing {key}: {e}")
+    return history
+st.title('Extract Fields from Product Manuals')
+link = st.text_input('Enter the link to the product document:')
+if st.button('Process'):
+    if link:
+        final_result = main(link)
+        st.write('Final extracted fields/tags:')
+        st.json(final_result)
+    else:
+        st.write('Please enter a valid link.')