Spaces:
Sleeping
Sleeping
Prathmesh48
committed on
Commit
•
1ff214d
1
Parent(s):
433d541
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import concurrent.futures
import os
import random
from urllib.parse import urlparse

import streamlit as st
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
7 |
+
|
8 |
+
# SECURITY: API keys must never be committed with the source. They are read
# from the environment instead. Any key that was previously hard-coded here
# should be treated as leaked and revoked in the Google Cloud console.
#
# GOOGLE_API_KEY    - required; used by the first client.
# GOOGLE_API_KEY_2  - optional second key; falls back to the first key when
#                     unset, so both clients still work with a single key.
_GEMINI_MODEL = "gemini-1.0-pro-001"
_PRIMARY_API_KEY = os.environ["GOOGLE_API_KEY"]
_SECONDARY_API_KEY = os.environ.get("GOOGLE_API_KEY_2") or _PRIMARY_API_KEY

gemini = ChatGoogleGenerativeAI(
    model=_GEMINI_MODEL,
    google_api_key=_PRIMARY_API_KEY,
    temperature=0.1,
)
gemini1 = ChatGoogleGenerativeAI(
    model=_GEMINI_MODEL,
    google_api_key=_SECONDARY_API_KEY,
    temperature=0.1,
)
|
10 |
+
|
11 |
+
def pdf_extractor(link):
    """Read the PDF at ``link`` and return its text as a one-element list.

    The document is loaded with ``PyPDFLoader(link).load_and_split()`` and the
    ``page_content`` of every returned piece is concatenated in order, with
    no separator between pieces.

    Args:
        link: Location of the PDF, passed unchanged to ``PyPDFLoader``.

    Returns:
        A list containing exactly one string: the concatenated text.
    """
    documents = PyPDFLoader(link).load_and_split()
    return ["".join(doc.page_content for doc in documents)]
|
18 |
+
|
19 |
+
def web_extractor(link):
    """Fetch the web page at ``link`` and return its text as a one-element list.

    The page is loaded with ``WebBaseLoader(link).load_and_split()`` and the
    ``page_content`` of every returned piece is concatenated in order, with
    no separator between pieces.

    Args:
        link: URL of the page, passed unchanged to ``WebBaseLoader``.

    Returns:
        A list containing exactly one string: the concatenated text.
    """
    documents = WebBaseLoader(link).load_and_split()
    return ["".join(doc.page_content for doc in documents)]
|
26 |
+
|
27 |
+
def feature_extraction(tag, history, context):
    """Ask a Gemini model to merge new details from ``context`` into one field.

    Args:
        tag: Name of the field being updated; interpolated into the prompt.
        history: Text gathered so far for that field.
        context: New material to read; rendered with ``str()`` in the prompt.

    Returns:
        The ``content`` attribute of the model's response.
    """
    instructions = f'''
You are an intelligent assistant tasked with updating product information. You have two data sources:
1. Tag_History: Previously gathered information about the product.
2. Tag_Context: New data that might contain additional details.

Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.

Guidelines:
- Only add new details that are relevant to the {tag} FIELD.
- Do not add or modify any other fields in the Tag_History.
- Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.

Here is the data:

Tag_Context: {context!s}
Tag_History: {history}

Respond with the updated Tag_History.
'''
    # One of the two module-level clients is picked at random for each call.
    llm = random.choice([gemini, gemini1])
    return llm.invoke(instructions).content
|
50 |
+
|
51 |
+
def _is_web_link(link):
    """Return True when ``link`` should be loaded as a web page, not a PDF."""
    if link.endswith('.md'):
        return True
    # Inspect the parsed host rather than a fixed character offset so that
    # "http://en.…" is recognised as well as "https://en.…".
    host = urlparse(link).hostname or ''
    return host.startswith('en.')


def main(link):
    """Extract the standard manual sections from the document at ``link``.

    The document text is loaded (web page or PDF), split into chunks, and
    every chunk is offered to the model once per section. Sections are
    processed in parallel within a chunk; chunks are processed one after
    another so each request sees the text accumulated from earlier chunks.
    Intermediate results and per-section errors are written to the
    Streamlit page as they arrive.

    Args:
        link: URL or path of the product document.

    Returns:
        A dict mapping each section name to the text accumulated for it.
        A section whose request failed keeps its previous value.
    """
    history = {
        "Introduction": "",
        "Specifications": "",
        "Product Overview": "",
        "Safety Information": "",
        "Installation Instructions": "",
        "Setup and Configuration": "",
        "Operation Instructions": "",
        "Maintenance and Care": "",
        "Troubleshooting": "",
        "Warranty Information": "",
        "Legal Information": "",
    }

    # Extract Text
    text = web_extractor(link) if _is_web_link(link) else pdf_extractor(link)

    # Create Chunks
    # Separators are tried in order, so the empty string must come last:
    # listed first it forces character-level splitting and chunks end
    # mid-word instead of on paragraph, line or word boundaries.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""],
    )
    chunks = text_splitter.create_documents(text)

    # A single pool is reused for every chunk instead of being rebuilt per chunk.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for chunk in chunks:
            future_to_key = {
                executor.submit(
                    feature_extraction, key, history[key], chunk.page_content
                ): key
                for key in history
            }
            # All futures of this chunk are drained before the next chunk is
            # submitted, so the next round starts from the updated history.
            for future in concurrent.futures.as_completed(future_to_key):
                key = future_to_key[future]
                try:
                    response = future.result()
                except Exception as e:
                    # Deliberate best effort: report the failure in the UI
                    # and keep processing the remaining sections.
                    st.write(f"Error processing {key}: {e}")
                else:
                    history[key] = response
                    st.write(f"Intermediate result for {key}: {response}")

    return history
|
95 |
+
|
96 |
+
# Streamlit page: ask for a document link and show the extracted fields.
st.title('Extract Fields from Product Manuals')
link = st.text_input('Enter the link to the product document:')
if st.button('Process'):
    # Pasted links often carry stray whitespace; strip it so a blank-looking
    # input is rejected here and suffix checks on the link are not defeated
    # by a trailing space or newline.
    link = (link or '').strip()
    if link:
        final_result = main(link)
        st.write('Final extracted fields/tags:')
        st.json(final_result)
    else:
        st.write('Please enter a valid link.')
|