kalvjam committed on
Commit
7522b53
·
1 Parent(s): a1aa58d

Adding logging

Browse files
Files changed (1) hide show
  1. app.py +38 -26
app.py CHANGED
@@ -12,11 +12,13 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
12
  from langchain.prompts import PromptTemplate
13
  from langchain.chains.summarize import load_summarize_chain
14
  from langchain.callbacks import get_openai_callback
 
15
 
16
  chkey = os.environ["API_TOKEN"]
17
  token = b64encode(f"{chkey}".encode('utf-8')).decode("ascii")
18
 
19
  with gr.Blocks(gr.themes.Soft()) as demo:
 
20
  # Create input Text search box
21
  input_box = gr.Textbox(label="Input search string for a UK Company Name")
22
  # State variable to store the Document ID
@@ -38,7 +40,7 @@ with gr.Blocks(gr.themes.Soft()) as demo:
38
  display_filing_doc_info = gr.Textbox(label="",interactive=False, visible=False)
39
 
40
  # OpenAPI Key Input box
41
- openapi_key_input = gr.Textbox(label="OpenAI API Key",type='password',interactive=True, visible=False)
42
 
43
  # Button to initiate the processing of the Document - OPENAI Call initiated here too
44
  process_filing_btn = gr.Button("Summarize the Account filing", visible=False)
@@ -54,13 +56,17 @@ with gr.Blocks(gr.themes.Soft()) as demo:
54
 
55
  # Function that does the Company search based on a search string. Gets the top 10 results
56
  def company_search(text):
 
 
57
  url = "https://api.company-information.service.gov.uk/advanced-search/companies?company_name_includes=" + text + "&company_status=active&size=10"
 
58
  auth = f'Basic {token}'
59
  payload={}
60
  headers = {
61
  'Authorization': auth
62
  }
63
  response = requests.request("GET", url, headers=headers, data=payload)
 
64
  select_resp = []
65
  if response.status_code == 200:
66
  resp = json.loads(response.text)
@@ -69,6 +75,8 @@ with gr.Blocks(gr.themes.Soft()) as demo:
69
  for key, value in comp["registered_office_address"].items():
70
  addr.append(value)
71
  select_resp.append(comp["company_number"] + " : " + comp["company_name"] + " : " + ', '.join(addr))
 
 
72
  return {output_col: gr.update(visible=True), company_list_box: gr.update(choices=select_resp,interactive=True)}
73
  else:
74
  select_resp.append("No matching companies found")
@@ -76,9 +84,11 @@ with gr.Blocks(gr.themes.Soft()) as demo:
76
 
77
  # Function to get the Filing information of a selected company
78
  def company_selected(selected_company, docid):
 
 
79
  regid = selected_company.split(' : ')[0]
80
  filings_url = "https://api.company-information.service.gov.uk/company/" + regid + "/filing-history?category=accounts&items_per_page=1"
81
- print(filings_url)
82
  auth = f'Basic {token}'
83
  payload={}
84
  headers = {
@@ -86,26 +96,30 @@ with gr.Blocks(gr.themes.Soft()) as demo:
86
  }
87
  response = requests.request("GET", filings_url, headers=headers, data=payload)
88
  resp = json.loads(response.text)
89
- #print(resp["items"])
90
- if len(resp["items"])>0:
91
- resp_value = f'Latest filing done on {resp["items"][0]["date"]}.'
92
- if "links" in resp["items"][0]:
93
- if "document_metadata" in resp["items"][0]["links"]:
94
- docid = resp["items"][0]["links"]["document_metadata"].rsplit('/',1)[-1]
95
- return {display_filing: gr.Textbox.update(visible=True, value=resp_value), submit_btn: gr.update(visible=True), doc_id : docid}
 
 
 
 
96
  else:
97
- resp_value += "But Document Metadata is not available."
98
  return {display_filing: gr.Textbox.update(visible=True, value=resp_value), submit_btn: gr.update(visible=False), doc_id : "None"}
99
  else:
100
- resp_value += "But Links to the filing not available."
101
- return {display_filing: gr.Textbox.update(visible=True, value=resp_value), submit_btn: gr.update(visible=False), doc_id : "None"}
102
  else:
103
  return {display_filing: gr.Textbox.update(visible=True, value="No record of accounts filed for the company"), submit_btn: gr.update(visible=False), doc_id : "None"}
104
 
105
  # Function to get the Filing document related to the latest Annual Account filing
106
  def get_filing(docid):
 
107
  doc_url = "https://document-api.company-information.service.gov.uk/document/" + docid + "/content"
108
- print(doc_url)
109
  auth = f'Basic {token}'
110
  payload={}
111
  headers = {
@@ -113,9 +127,8 @@ with gr.Blocks(gr.themes.Soft()) as demo:
113
  'Accept': 'application/pdf'
114
  }
115
  response = requests.request("GET", doc_url, headers=headers, data=payload)
116
- #print(response.text)
117
  content_type = response.headers['Content-Type']
118
- print(content_type)
119
  resp_value = f'Filing document is of type {content_type}. '
120
  if content_type == 'application/pdf':
121
  filename = f'doc_{docid}.pdf'
@@ -124,19 +137,16 @@ with gr.Blocks(gr.themes.Soft()) as demo:
124
  f.write(response.content)
125
  pdf_document = fitz.open(filepath)
126
  resp_value += f'PDF saved as: {filename}. There are a total of {pdf_document.page_count} pages'
127
- print(resp_value)
128
  return {display_filing_doc_info: gr.Textbox.update(visible=True, value=resp_value), process_filing_btn: gr.update(visible=True), openapi_key_input: gr.update(visible=True), processed_info: gr.update(visible=True), doc_id : docid}
129
- elif content_type == 'application/xhtml+xml':
130
- print('Work in progress to process these type of filings')
131
- resp_value += 'Work in progress to process these type of filings'
132
- return {display_filing_doc_info: gr.Textbox.update(visible=True, value=resp_value), process_filing_btn: gr.update(visible=False), openapi_key_input: gr.update(visible=False), processed_info: gr.update(visible=False), doc_id : "None"}
133
  else:
134
- print('Work in progress to process these type of filings')
135
  resp_value += 'Work in progress to process these type of filings'
 
136
  return {display_filing_doc_info: gr.Textbox.update(visible=True, value=resp_value), process_filing_btn: gr.update(visible=False), openapi_key_input: gr.update(visible=False), processed_info: gr.update(visible=False), doc_id : "None"}
137
 
138
  # Function to initial the Langchain chain with call to OPENAI to Summarize the Annual report
139
  def langchain_summarize(contents,openai_api_key):
 
140
  concatenated_content = '`n`n'.join(contents)
141
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200,chunk_overlap=20,length_function=len)
142
  docs = text_splitter.create_documents([concatenated_content])
@@ -158,6 +168,7 @@ with gr.Blocks(gr.themes.Soft()) as demo:
158
 
159
  # Function to extract text from the document and call the LangChain processing function with the text array of pages
160
  def process_filing(docid, openai_api_key, progress=gr.Progress()):
 
161
  progress(0,desc="Starting...")
162
  filepath = f'./data/doc_{docid}.pdf'
163
  pdf_document = fitz.open(filepath)
@@ -180,28 +191,29 @@ with gr.Blocks(gr.themes.Soft()) as demo:
180
  # save list to file
181
  with open(text_path, 'wb') as f:
182
  pickle.dump(contents, f)
183
- #print(contents)
184
  resp_value = f'Total of {pdf_document.page_count} pages processed. '
185
  summary_path = f'./summary/doc_{docid}.txt'
186
  if os.path.exists(summary_path):
187
  with open(summary_path, 'r') as f:
188
  summary = f.read()
189
- print(resp_value)
190
  return {processed_info: gr.Textbox.update(visible=True, value=resp_value), summary_text: gr.Textbox.update(visible=True, value=summary)}
191
  else:
192
- #progress(track_tqdm=True,desc="Calling OpenAI to summarize...")
193
  try:
194
  summary, tkn_text = langchain_summarize(contents, openai_api_key)
195
  resp_value += tkn_text
 
196
  with open(summary_path, 'wb') as f:
197
  f.write(summary.encode())
198
  return {processed_info: gr.Textbox.update(visible=True, value=resp_value), summary_text: gr.Textbox.update(visible=True, value=summary)}
199
  except Exception as e:
200
- print(e)
201
  resp_value += 'LLM Call failed. Please check the OpenAI key again'
 
202
  return {processed_info: gr.Textbox.update(visible=True, value=resp_value), summary_text: gr.Textbox.update(visible=False)}
203
  finally:
204
- print(resp_value)
205
 
206
  def clear_screen():
207
  return {output_col: gr.update(visible=False),display_filing: gr.Textbox.update(visible=False),submit_btn: gr.update(visible=False), display_filing_doc_info:gr.update(visible=False), process_filing_btn:gr.update(visible=False),openapi_key_input:gr.update(visible=False),processed_info:gr.update(visible=False),summary_text:gr.update(visible=False)}
 
12
  from langchain.prompts import PromptTemplate
13
  from langchain.chains.summarize import load_summarize_chain
14
  from langchain.callbacks import get_openai_callback
15
+ import logging
16
 
17
  chkey = os.environ["API_TOKEN"]
18
  token = b64encode(f"{chkey}".encode('utf-8')).decode("ascii")
19
 
20
  with gr.Blocks(gr.themes.Soft()) as demo:
21
+ logging.info("*** App Starting ***")
22
  # Create input Text search box
23
  input_box = gr.Textbox(label="Input search string for a UK Company Name")
24
  # State variable to store the Document ID
 
40
  display_filing_doc_info = gr.Textbox(label="",interactive=False, visible=False)
41
 
42
  # OpenAPI Key Input box
43
+ openapi_key_input = gr.Textbox(label="OpenAI API Key", type='password', interactive=True, visible=False)
44
 
45
  # Button to initiate the processing of the Document - OPENAI Call initiated here too
46
  process_filing_btn = gr.Button("Summarize the Account filing", visible=False)
 
56
 
57
  # Function that does the Company search based on a search string. Gets the top 10 results
58
  def company_search(text):
59
+ logging.info("*** New Search Starting ***")
60
+ logging.info(f'Search term : {text}')
61
  url = "https://api.company-information.service.gov.uk/advanced-search/companies?company_name_includes=" + text + "&company_status=active&size=10"
62
+ logging.info(f'Calling Companies House API Advanced search : {url}')
63
  auth = f'Basic {token}'
64
  payload={}
65
  headers = {
66
  'Authorization': auth
67
  }
68
  response = requests.request("GET", url, headers=headers, data=payload)
69
+ logging.info(f'API Response Code : {response.status_code}')
70
  select_resp = []
71
  if response.status_code == 200:
72
  resp = json.loads(response.text)
 
75
  for key, value in comp["registered_office_address"].items():
76
  addr.append(value)
77
  select_resp.append(comp["company_number"] + " : " + comp["company_name"] + " : " + ', '.join(addr))
78
+ resp_joined = (','.join(select_resp))
79
+ logging.info(f'Response list : {resp_joined}')
80
  return {output_col: gr.update(visible=True), company_list_box: gr.update(choices=select_resp,interactive=True)}
81
  else:
82
  select_resp.append("No matching companies found")
 
84
 
85
  # Function to get the Filing information of a selected company
86
  def company_selected(selected_company, docid):
87
+ logging.info("* Company selected. Getting Filing History *")
88
+ logging.info(f'User Selection : {selected_company}')
89
  regid = selected_company.split(' : ')[0]
90
  filings_url = "https://api.company-information.service.gov.uk/company/" + regid + "/filing-history?category=accounts&items_per_page=1"
91
+ logging.info(f'Calling Companies House API Filings Endpoint : {filings_url}')
92
  auth = f'Basic {token}'
93
  payload={}
94
  headers = {
 
96
  }
97
  response = requests.request("GET", filings_url, headers=headers, data=payload)
98
  resp = json.loads(response.text)
99
+ logging.info(f'API Response Code : {response.status_code}')
100
+ if response.status_code == 200:
101
+ if len(resp["items"])>0:
102
+ resp_value = f'Latest filing done on {resp["items"][0]["date"]}.'
103
+ if "links" in resp["items"][0]:
104
+ if "document_metadata" in resp["items"][0]["links"]:
105
+ docid = resp["items"][0]["links"]["document_metadata"].rsplit('/',1)[-1]
106
+ return {display_filing: gr.Textbox.update(visible=True, value=resp_value), submit_btn: gr.update(visible=True), doc_id : docid}
107
+ else:
108
+ resp_value += "But Document Metadata is not available."
109
+ return {display_filing: gr.Textbox.update(visible=True, value=resp_value), submit_btn: gr.update(visible=False), doc_id : "None"}
110
  else:
111
+ resp_value += "But Links to the filing not available."
112
  return {display_filing: gr.Textbox.update(visible=True, value=resp_value), submit_btn: gr.update(visible=False), doc_id : "None"}
113
  else:
114
+ return {display_filing: gr.Textbox.update(visible=True, value="No record of accounts filed for the company"), submit_btn: gr.update(visible=False), doc_id : "None"}
 
115
  else:
116
  return {display_filing: gr.Textbox.update(visible=True, value="No record of accounts filed for the company"), submit_btn: gr.update(visible=False), doc_id : "None"}
117
 
118
  # Function to get the Filing document related to the latest Annual Account filing
119
  def get_filing(docid):
120
+ logging.info("* Getting Filing Document for latest filing *")
121
  doc_url = "https://document-api.company-information.service.gov.uk/document/" + docid + "/content"
122
+ logging.info(f'Calling Companies House Documents API : {doc_url}')
123
  auth = f'Basic {token}'
124
  payload={}
125
  headers = {
 
127
  'Accept': 'application/pdf'
128
  }
129
  response = requests.request("GET", doc_url, headers=headers, data=payload)
130
+ logging.info(f'API Response Code : {response.status_code}')
131
  content_type = response.headers['Content-Type']
 
132
  resp_value = f'Filing document is of type {content_type}. '
133
  if content_type == 'application/pdf':
134
  filename = f'doc_{docid}.pdf'
 
137
  f.write(response.content)
138
  pdf_document = fitz.open(filepath)
139
  resp_value += f'PDF saved as: {filename}. There are a total of {pdf_document.page_count} pages'
140
+ logging.info(resp_value)
141
  return {display_filing_doc_info: gr.Textbox.update(visible=True, value=resp_value), process_filing_btn: gr.update(visible=True), openapi_key_input: gr.update(visible=True), processed_info: gr.update(visible=True), doc_id : docid}
 
 
 
 
142
  else:
 
143
  resp_value += 'Work in progress to process these type of filings'
144
+ logging.info(resp_value)
145
  return {display_filing_doc_info: gr.Textbox.update(visible=True, value=resp_value), process_filing_btn: gr.update(visible=False), openapi_key_input: gr.update(visible=False), processed_info: gr.update(visible=False), doc_id : "None"}
146
 
147
  # Function to initial the Langchain chain with call to OPENAI to Summarize the Annual report
148
  def langchain_summarize(contents,openai_api_key):
149
+ logging.info("* Calling Langchain / OPENAI to get the summary *")
150
  concatenated_content = '`n`n'.join(contents)
151
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200,chunk_overlap=20,length_function=len)
152
  docs = text_splitter.create_documents([concatenated_content])
 
168
 
169
  # Function to extract text from the document and call the LangChain processing function with the text array of pages
170
  def process_filing(docid, openai_api_key, progress=gr.Progress()):
171
+ logging.info("* Processing the filing document *")
172
  progress(0,desc="Starting...")
173
  filepath = f'./data/doc_{docid}.pdf'
174
  pdf_document = fitz.open(filepath)
 
191
  # save list to file
192
  with open(text_path, 'wb') as f:
193
  pickle.dump(contents, f)
194
+
195
  resp_value = f'Total of {pdf_document.page_count} pages processed. '
196
  summary_path = f'./summary/doc_{docid}.txt'
197
  if os.path.exists(summary_path):
198
  with open(summary_path, 'r') as f:
199
  summary = f.read()
200
+ logging.info(resp_value)
201
  return {processed_info: gr.Textbox.update(visible=True, value=resp_value), summary_text: gr.Textbox.update(visible=True, value=summary)}
202
  else:
 
203
  try:
204
  summary, tkn_text = langchain_summarize(contents, openai_api_key)
205
  resp_value += tkn_text
206
+ logging.info(resp_value)
207
  with open(summary_path, 'wb') as f:
208
  f.write(summary.encode())
209
  return {processed_info: gr.Textbox.update(visible=True, value=resp_value), summary_text: gr.Textbox.update(visible=True, value=summary)}
210
  except Exception as e:
211
+ logging.info(e)
212
  resp_value += 'LLM Call failed. Please check the OpenAI key again'
213
+ logging.info(resp_value)
214
  return {processed_info: gr.Textbox.update(visible=True, value=resp_value), summary_text: gr.Textbox.update(visible=False)}
215
  finally:
216
+ logging.info(resp_value)
217
 
218
  def clear_screen():
219
  return {output_col: gr.update(visible=False),display_filing: gr.Textbox.update(visible=False),submit_btn: gr.update(visible=False), display_filing_doc_info:gr.update(visible=False), process_filing_btn:gr.update(visible=False),openapi_key_input:gr.update(visible=False),processed_info:gr.update(visible=False),summary_text:gr.update(visible=False)}