Spaces:

rolwinpinto
/

finanalyst

Sleeping

App Files Files Community

rolwinpinto committed on Aug 14

Commit

cfff27b

•

1 Parent(s): d226ff4

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -39

app.py CHANGED Viewed

@@ -3,29 +3,27 @@ import streamlit as st
 import PyPDF2
 import matplotlib.pyplot as plt
 from io import BytesIO
-from llama_index.embeddings import HuggingFaceEmbedding
-from llama_index.schema import Document
-from sklearn.metrics.pairwise import cosine_similarity
-import numpy as np
-import dotenv
 import re
 import requests
 # Load environment variables
 dotenv.load_dotenv()
-# Configure Hugging Face API
 API_URL = "https://api-inference.huggingface.co/models/sarvamai/sarvam-2b-v0.5"
 # Built once at import time; if HUGGINGFACE_API_KEY is unset the header reads "Bearer None".
 headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_KEY')}"}
 # Configure embedding model
-embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
 def query_huggingface_api(payload, timeout=60):
     """POST ``payload`` as JSON to the Hugging Face Inference API and return the decoded reply.
     Args:
         payload: JSON-serialisable request body, sent to the module-level ``API_URL``
             together with the module-level ``headers``.
         timeout: Seconds to wait for the server before giving up. Pass ``None`` to wait
             indefinitely.
     Returns:
         Whatever ``response.json()`` decodes from the response body. The HTTP status code
         is not checked here, so a JSON error body is handed back to the caller unchanged.
     Raises:
         requests.exceptions.Timeout: If the server does not answer within ``timeout``.
     """
     # requests applies no timeout by default; without one a stalled endpoint blocks the app.
     response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
     return response.json()
-def write_to_file(content, filename="./files/test.pdf"):
     os.makedirs(os.path.dirname(filename), exist_ok=True)
     with open(filename, "wb") as f:
         f.write(content)
@@ -35,10 +33,9 @@ def extract_financial_data(document_text):
         "Revenue": [],
         "Date": []
     }
     lines = document_text.split("\n")
     revenue_pattern = re.compile(r'\$?\d+(?:,\d{3})*(?:\.\d+)?')
     for i, line in enumerate(lines):
         if any(keyword in line.lower() for keyword in ["revenue", "total revenue", "sales"]):
             for j in range(i + 1, i + 6):
@@ -51,7 +48,7 @@ def extract_financial_data(document_text):
                                 financial_data["Revenue"].append(value)
                             except ValueError:
                                 continue
         if "Q1" in line or "Q2" in line or "Q3" in line or "Q4" in line or re.search(r'FY\s*\d{4}', line):
             financial_data["Date"].append(line.strip())
@@ -61,7 +58,17 @@ def extract_financial_data(document_text):
     return financial_data
-def generate_summary(document_text, query):
     prompt = f"""
     You are a financial analyst. Your task is to provide a comprehensive analysis of the financial document.
     Analyze the following document and respond to the query:
@@ -99,26 +106,6 @@ def generate_comparison_graph(data):
     plt.tight_layout()
     st.pyplot(fig)
-def search_similar_sections(document_text, query, top_k=3):
-    # Split the document into sections (you may need to adjust this based on your document structure)
-    sections = document_text.split('\n\n')
-    # Create Document objects for each section
-    documents = [Document(text=section) for section in sections]
-    # Compute embeddings for the query and all sections
-    query_embedding = embed_model.get_text_embedding(query)
-    section_embeddings = [embed_model.get_text_embedding(doc.text) for doc in documents]
-    # Compute cosine similarities
-    similarities = cosine_similarity([query_embedding], section_embeddings)[0]
-    # Get indices of top-k similar sections
-    top_indices = np.argsort(similarities)[-top_k:][::-1]
-    # Return top-k similar sections
-    return [sections[i] for i in top_indices]
 # Streamlit app
 def main():
     st.title("Fortune 500 Financial Document Analyzer")
@@ -142,20 +129,18 @@ def main():
         # Extract financial data
         financial_data = extract_financial_data(document_text)
         # Add a provision for user query input
         query = st.text_input("Enter your financial analysis query (e.g., 'What are the revenue trends?')", "")
         if query:
-            summary = generate_summary(document_text, query)
             st.write("## Financial Analysis Result")
             st.write(summary)
-            st.write("## Relevant Document Sections")
-            similar_sections = search_similar_sections(document_text, query)
-            for i, section in enumerate(similar_sections, 1):
-                st.write(f"### Section {i}")
-                st.write(section)
         # Display revenue comparison graph
         if financial_data["Revenue"] and financial_data["Date"]:
             st.write("## Revenue Comparison")

 import PyPDF2
 import matplotlib.pyplot as plt
 from io import BytesIO
+from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
+from llama_index.embeddings.fastembed import FastEmbedEmbedding
 import re
 import requests
+import dotenv
 # Load environment variables
 dotenv.load_dotenv()
+# Configure Hugging Face API for Sarvam model
 API_URL = "https://api-inference.huggingface.co/models/sarvamai/sarvam-2b-v0.5"
 # Built once at import time; if HUGGINGFACE_API_KEY is unset the header reads "Bearer None".
 headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_KEY')}"}
 # Configure embedding model
+Settings.embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")
 def query_huggingface_api(payload, timeout=60):
     """POST ``payload`` as JSON to the Hugging Face Inference API and return the decoded reply.
     Args:
         payload: JSON-serialisable request body, sent to the module-level ``API_URL``
             together with the module-level ``headers``.
         timeout: Seconds to wait for the server before giving up. Pass ``None`` to wait
             indefinitely.
     Returns:
         Whatever ``response.json()`` decodes from the response body. The HTTP status code
         is not checked here, so a JSON error body is handed back to the caller unchanged.
     Raises:
         requests.exceptions.Timeout: If the server does not answer within ``timeout``.
     """
     # requests applies no timeout by default; without one a stalled endpoint blocks the app.
     response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
     return response.json()
+def write_to_file(content, filename="./files/uploaded.pdf"):
     os.makedirs(os.path.dirname(filename), exist_ok=True)
     with open(filename, "wb") as f:
         f.write(content)
         "Revenue": [],
         "Date": []
     }
     lines = document_text.split("\n")
     revenue_pattern = re.compile(r'\$?\d+(?:,\d{3})*(?:\.\d+)?')
     for i, line in enumerate(lines):
         if any(keyword in line.lower() for keyword in ["revenue", "total revenue", "sales"]):
             for j in range(i + 1, i + 6):
                                 financial_data["Revenue"].append(value)
                             except ValueError:
                                 continue
         if "Q1" in line or "Q2" in line or "Q3" in line or "Q4" in line or re.search(r'FY\s*\d{4}', line):
             financial_data["Date"].append(line.strip())
     return financial_data
+def ingest_documents():
+    """Load the contents of the ``./files/`` directory and return the loaded documents.
+    Takes no arguments: the directory is hard-coded, and it is the same directory that
+    ``write_to_file``'s default ``filename`` points into.
+    NOTE(review): presumably everything ``SimpleDirectoryReader`` can parse in that directory
+    is returned, including files left over from earlier runs -- confirm against llama_index docs.
+    """
+    reader = SimpleDirectoryReader("./files/")
+    documents = reader.load_data()
+    return documents
+def load_data(documents):
+    """Build and return a ``VectorStoreIndex`` over ``documents``.
+    NOTE(review): presumably the embeddings come from the module-level ``Settings.embed_model``
+    assignment -- confirm against llama_index docs.
+    """
+    index = VectorStoreIndex.from_documents(documents)
+    return index
+def generate_summary(index, document_text, query):
+    query_engine = index.as_query_engine()
     prompt = f"""
     You are a financial analyst. Your task is to provide a comprehensive analysis of the financial document.
     Analyze the following document and respond to the query:
     plt.tight_layout()
     st.pyplot(fig)
 # Streamlit app
 def main():
     st.title("Fortune 500 Financial Document Analyzer")
         # Extract financial data
         financial_data = extract_financial_data(document_text)
+        # Ingest documents for summarization and query-driven analysis
+        documents = ingest_documents()
+        index = load_data(documents)
         # Add a provision for user query input
         query = st.text_input("Enter your financial analysis query (e.g., 'What are the revenue trends?')", "")
         if query:
+            summary = generate_summary(index, document_text, query)
             st.write("## Financial Analysis Result")
             st.write(summary)
         # Display revenue comparison graph
         if financial_data["Revenue"] and financial_data["Date"]:
             st.write("## Revenue Comparison")