Prathmesh48 committed on
Commit
1ff214d
1 Parent(s): 433d541

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -0
app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import concurrent.futures
3
+ import random
4
+ from langchain_google_genai import ChatGoogleGenerativeAI
5
+ from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+
8
def _gemini_client(*env_vars):
    """Build a Gemini chat client whose API key is read from the environment.

    Args:
        *env_vars: Environment variable names to try, in order. The first one
            holding a non-empty value supplies the API key.

    Returns:
        A ``ChatGoogleGenerativeAI`` instance configured with the same model
        name and temperature this module has always used.

    Raises:
        RuntimeError: If none of the named environment variables is set.
    """
    import os  # local import keeps this block self-contained

    api_key = next((os.environ[name] for name in env_vars if os.environ.get(name)), None)
    if not api_key:
        raise RuntimeError(
            "Missing Google API key: set one of " + ", ".join(env_vars)
        )
    return ChatGoogleGenerativeAI(
        model="gemini-1.0-pro-001",
        google_api_key=api_key,
        temperature=0.1,
    )


# SECURITY: API keys must never be hard-coded in source. Any key that has ever
# been committed to version control should be treated as compromised: revoke
# and rotate it, then supply the replacement through the environment (for
# example as a deployment secret).
gemini = _gemini_client("GOOGLE_API_KEY")
# The second client may use its own key; it falls back to the primary key so a
# single-key deployment still works.
gemini1 = _gemini_client("GOOGLE_API_KEY_2", "GOOGLE_API_KEY")
10
+
11
def pdf_extractor(link):
    """Load a PDF and return its full text wrapped in a one-element list.

    Args:
        link: Location of the PDF, passed unchanged to ``PyPDFLoader``.

    Returns:
        A single-item list whose only element is the concatenated
        ``page_content`` of every loaded document. The list wrapper lets the
        result be handed directly to a text splitter's ``create_documents``.
    """
    # Use load() rather than load_and_split(): the latter re-chunks the pages
    # with a default splitter that overlaps neighbouring chunks, so joining
    # its output would repeat the overlapping text. Chunking is the caller's
    # job.
    pages = PyPDFLoader(link).load()
    return ["".join(page.page_content for page in pages)]
18
+
19
def web_extractor(link):
    """Fetch a web page and return its text wrapped in a one-element list.

    Args:
        link: URL of the page, passed unchanged to ``WebBaseLoader``.

    Returns:
        A single-item list whose only element is the concatenated
        ``page_content`` of every loaded document. The list wrapper lets the
        result be handed directly to a text splitter's ``create_documents``.
    """
    # Use load() rather than load_and_split(): the latter re-chunks the page
    # with a default splitter that overlaps neighbouring chunks, so joining
    # its output would repeat the overlapping text. Chunking is the caller's
    # job.
    documents = WebBaseLoader(link).load()
    return ["".join(document.page_content for document in documents)]
26
+
27
def feature_extraction(tag, history, context):
    """Ask a chat model to fold new details about one field into its history.

    Args:
        tag: Name of the field being updated; interpolated into the prompt.
        history: Text gathered so far for that field.
        context: New source material; converted with ``str`` for the prompt.

    Returns:
        The ``content`` attribute of the model's response.
    """
    context_text = str(context)
    instructions = f'''
You are an intelligent assistant tasked with updating product information. You have two data sources:
1. Tag_History: Previously gathered information about the product.
2. Tag_Context: New data that might contain additional details.

Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.

Guidelines:
- Only add new details that are relevant to the {tag} FIELD.
- Do not add or modify any other fields in the Tag_History.
- Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.

Here is the data:

Tag_Context: {context_text}
Tag_History: {history}

Respond with the updated Tag_History.
'''
    # One of the two module-level clients is picked at random per request.
    chosen_model = random.choice([gemini, gemini1])
    return chosen_model.invoke(instructions).content
50
+
51
def _is_web_page(link):
    """Return True when *link* should be fetched as a web page, not as a PDF."""
    from urllib.parse import urlparse

    try:
        parsed = urlparse(link)
        host = parsed.hostname or ""
        path = parsed.path
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): only the raw suffix
        # check below can apply.
        host, path = "", link
    # Markdown documents and "en." hosts are treated as web pages. Checking
    # the parsed hostname works for http:// and https:// alike, unlike a
    # fixed-offset slice of the raw string.
    return link.endswith('.md') or path.endswith('.md') or host.startswith('en.')


def main(link):
    """Extract per-section product information from a document link.

    The document is loaded (as a web page or a PDF), split into chunks, and
    each chunk is sent to ``feature_extraction`` once per section. Sections
    are processed concurrently within a chunk; chunks are processed in order
    so every request sees the history accumulated from earlier chunks.

    Args:
        link: Location of the product document.

    Returns:
        A dict mapping each section name to the latest text returned for it.
        Sections whose requests all failed keep their previous value
        (initially an empty string).
    """
    history = {
        "Introduction": "",
        "Specifications": "",
        "Product Overview": "",
        "Safety Information": "",
        "Installation Instructions": "",
        "Setup and Configuration": "",
        "Operation Instructions": "",
        "Maintenance and Care": "",
        "Troubleshooting": "",
        "Warranty Information": "",
        "Legal Information": "",
    }

    # Stray whitespace from the text box would defeat the suffix/host checks.
    link = link.strip()

    # Extract Text
    text = web_extractor(link) if _is_web_page(link) else pdf_extractor(link)

    # Create Chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=100,
        # Coarsest separator first. An empty separator in first position would
        # force character-level splitting and make the rest unreachable.
        separators=["\n\n", "\n", " ", ""],
    )
    chunks = text_splitter.create_documents(text)

    # One pool for the whole run; each chunk's futures are fully drained
    # before the next chunk is submitted.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for chunk in chunks:
            future_to_key = {
                executor.submit(feature_extraction, key, value, chunk.page_content): key
                for key, value in history.items()
            }
            for future in concurrent.futures.as_completed(future_to_key):
                key = future_to_key[future]
                try:
                    response = future.result()
                except Exception as e:
                    # Best effort: one failed section must not abort the rest.
                    st.write(f"Error processing {key}: {e}")
                else:
                    history[key] = response
                    st.write(f"Intermediate result for {key}: {response}")

    return history
95
+
96
# Streamlit page: a title, one text box for the document link, and a button
# that runs the extraction and renders the resulting fields as JSON.
st.title('Extract Fields from Product Manuals')
link = st.text_input('Enter the link to the product document:')
process_clicked = st.button('Process')

if process_clicked and not link:
    # Button pressed with an empty text box.
    st.write('Please enter a valid link.')
elif process_clicked:
    final_result = main(link)
    st.write('Final extracted fields/tags:')
    st.json(final_result)