analyticsbyte committed on
Commit
045d75e
1 Parent(s): ad76dfc

Upload 3 files

Browse files

The invoice data extraction code

Files changed (3) hide show
  1. app.py +39 -0
  2. pipeline.py +94 -0
  3. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+
4
+ from pipeline import create_docs
5
+
6
+
7
+
8
def main():
    """Render the Streamlit page for invoice data extraction.

    Calls ``load_dotenv()`` first, then draws the title, a multi-file PDF
    uploader and an "Extract Data" button. When the button is pressed and
    at least one file has been uploaded, the files are handed to
    ``create_docs``; the first rows of the resulting DataFrame are shown
    and the full table is offered as a CSV download.
    """
    load_dotenv()

    st.set_page_config(page_title="Invoice Extraction Bot")
    st.title("Invoice Extraction Bot...💁 ")
    st.subheader("I can help you in extracting invoice data")

    # Upload the invoices (PDF files only; several can be selected at once).
    pdf = st.file_uploader(
        "Upload invoices here, only PDF files allowed",
        type=["pdf"],
        accept_multiple_files=True,
    )

    submit = st.button("Extract Data")
    if not submit:
        return

    # Without any upload there is nothing to extract: tell the user instead
    # of running the pipeline and offering an empty CSV for download.
    if not pdf:
        st.warning("Please upload at least one PDF invoice first.")
        return

    with st.spinner('Wait for it...'):
        df = create_docs(pdf)
        # Preview only; the download below contains every row.
        st.write(df.head())

        data_as_csv = df.to_csv(index=False).encode("utf-8")
        st.download_button(
            "Download data as CSV",
            data_as_csv,
            "benchmark-tools.csv",
            "text/csv",
            key="download-tools-csv",
        )
    st.success("Hope I was able to save your time❤️")


# Invoke main() only when this file is executed as a script.
if __name__ == '__main__':
    main()
pipeline.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from langchain.llms import OpenAI
2
+ from langchain_openai import OpenAI
3
+ from pypdf import PdfReader
4
+ from langchain.llms.openai import OpenAI
5
+ import pandas as pd
6
+ import re
7
+ import replicate
8
+ from langchain.prompts import PromptTemplate
9
+
10
+ #Extract Information from PDF file
11
def get_pdf_text(pdf_doc):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_doc: The PDF source; it is passed unchanged to ``PdfReader``.

    Returns:
        A single string with the extracted text of all pages (empty when
        the document has no pages or no extractable text).
    """
    pdf_reader = PdfReader(pdf_doc)
    # join() builds the result in one pass instead of re-allocating the
    # string per page; ``or ""`` keeps a page that yields no text from
    # breaking the concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
17
+
18
+
19
+
20
+ #Function to extract data from text
21
def extracted_data(pages_data):
    """Ask the OpenAI LLM to extract invoice fields from raw invoice text.

    Args:
        pages_data: Text of one invoice; it is substituted for ``{pages}``
            in the prompt.

    Returns:
        The model's raw completion. The prompt asks for a Python-dict-like
        literal with the keys shown in the example, but the reply is not
        validated here.
    """
    # Doubled braces are literal braces for PromptTemplate; only {pages}
    # is a placeholder.
    template = (
        "Extract all the following values : invoice no., Description, Quantity, date,\n"
        "Unit price , Amount, Total, email, phone number and address from this data: {pages}\n"
        "\n"
        "Expected output: remove any dollar symbols {{'Invoice no.': '1001329','Description': 'Office Chair','Quantity': '2','Date': '5/4/2023','Unit price': '1100.00','Amount': '2200.00','Total': '2200.00','Email': 'Santoshvarma0988@gmail.com','Phone number': '9999999999','Address': 'Mumbai, India'}}\n"
    )
    prompt_template = PromptTemplate(input_variables=["pages"], template=template)
    prompt = prompt_template.format(pages=pages_data)

    # Extraction should be reproducible and stay close to the requested
    # format, so sample greedily rather than at a creative temperature.
    llm = OpenAI(temperature=0)

    # NOTE: to use Llama 2 hosted on Replicate instead, replace the call
    # below with
    #   replicate.run(
    #       'replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1',
    #       input={"prompt": prompt, "temperature": 0.1, "top_p": 0.9,
    #              "max_length": 512, "repetition_penalty": 1})
    # and join the returned items into a single string.

    # invoke() is the supported entry point; calling the LLM object
    # directly is the deprecated form.
    return llm.invoke(prompt)
47
+
48
+
49
+ # Iterate over the PDF files that the user uploaded,
50
+ # one by one, and collect the fields extracted from each.
51
def _parse_llm_dict(llm_text):
    """Return the dict literal embedded in *llm_text*, or ``{}`` if none parses."""
    import ast

    # Greedy match with DOTALL: spans from the first "{" to the last "}",
    # so a reply wrapped in extra prose or spread over lines still matches.
    match = re.search(r'{(.+)}', llm_text, re.DOTALL)
    if not match:
        return {}
    try:
        # SECURITY: this text is model output derived from user-uploaded
        # PDFs. The previous implementation passed it to eval(), which
        # executes arbitrary expressions; literal_eval only accepts
        # Python literals.
        parsed = ast.literal_eval(match.group(0))
    except (ValueError, TypeError, SyntaxError):
        return {}
    # "{...}" can also be a set literal; only a dict is a usable record.
    return parsed if isinstance(parsed, dict) else {}


def create_docs(user_pdf_list):
    """Extract invoice fields from each uploaded PDF into one DataFrame.

    For every item the text is read with ``get_pdf_text``, sent to the LLM
    via ``extracted_data``, and the dict literal found in the reply becomes
    one row.

    Args:
        user_pdf_list: Iterable of PDF sources accepted by
            ``get_pdf_text``; ``None`` is treated as empty.

    Returns:
        A DataFrame with one row per input file. The ten expected invoice
        columns always come first, in a fixed order; any additional keys
        the model returned follow as extra columns. A file whose reply
        contains no parsable dict yields a row of missing values.
    """
    import logging

    logger = logging.getLogger(__name__)

    columns = [
        'Invoice no.',
        'Description',
        'Quantity',
        'Date',
        'Unit price',
        'Amount',
        'Total',
        'Email',
        'Phone number',
        'Address',
    ]

    # Collect plain dicts and build the frame once, instead of growing a
    # DataFrame row by row through the private ``_append`` method.
    records = []
    for pdf_file in user_pdf_list or []:
        raw_data = get_pdf_text(pdf_file)
        llm_extracted_data = extracted_data(raw_data)

        data_dict = _parse_llm_dict(llm_extracted_data)
        if not data_dict:
            # Log only an identifier, never the invoice text itself.
            logger.warning(
                "No invoice fields could be parsed for %s",
                getattr(pdf_file, "name", pdf_file),
            )
        records.append(data_dict)

    df = pd.DataFrame(records)
    # Keep unexpected keys rather than silently dropping model output.
    extra = [col for col in df.columns if col not in columns]
    return df.reindex(columns=[*columns, *extra])
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ langchain==0.0.351
2
+ streamlit==1.29.0
3
+ openai==1.5.0
4
+ python-dotenv==1.0.0
5
+ pypdf==3.17.3
6
+ replicate==0.9.0