# Legally-Finance / app.py
# rolwinpinto's picture
# Update app.py
# 7c52f56 verified
import os
import PyPDF2
import matplotlib.pyplot as plt
from io import BytesIO
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.llms.gemini import Gemini
import re
import streamlit as st
# Global LlamaIndex settings: FastEmbed supplies the embedding model and
# Google Gemini (API key read from the GOOGLE_API_KEY environment variable)
# is the LLM used by query engines.
Settings.embed_model = FastEmbedEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)
Settings.llm = Gemini(
    api_key=os.getenv("GOOGLE_API_KEY"),
    temperature=0.5,
    model_name="models/gemini-pro",
)
def write_to_file(content, filename="./files/uploaded.pdf"):
    """Write raw bytes to ``filename``, creating parent directories as needed.

    Args:
        content: Bytes-like data to persist (the file is opened in binary mode).
        filename: Destination path. Defaults to ``./files/uploaded.pdf``.
    """
    parent_dir = os.path.dirname(filename)
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, "wb") as f:
        f.write(content)
def extract_financial_data(document_text):
    """Heuristically extract revenue figures and period labels from text.

    For every line mentioning a revenue keyword, the numbers found on up to
    five following lines are collected as revenue values. Every line that
    contains a quarter tag (Q1-Q4) or an ``FY <year>`` marker is collected as
    a date label. Both lists are truncated to the shorter length so they can
    be plotted against each other.

    Args:
        document_text: Full document text; ``None`` is treated as empty.

    Returns:
        dict with keys ``"Revenue"`` (list of float) and ``"Date"`` (list of
        str), always of equal length.
    """
    revenue_keywords = ("revenue", "total revenue", "sales")
    quarter_tags = ("Q1", "Q2", "Q3", "Q4")
    lookahead = 5  # number of lines scanned after a keyword line
    revenue_pattern = re.compile(r'\$?\d+(?:,\d{3})*(?:\.\d+)?')
    fiscal_year_pattern = re.compile(r'FY\s*\d{4}')

    revenues = []
    dates = []
    lines = (document_text or "").split("\n")
    for i, line in enumerate(lines):
        lowered = line.lower()
        # Check for revenue-related keywords
        if any(keyword in lowered for keyword in revenue_keywords):
            # Slicing clamps at the end of the list, so a keyword near the
            # end of the document no longer raises IndexError.
            for following in lines[i + 1:i + 1 + lookahead]:
                for match in revenue_pattern.findall(following):
                    try:
                        revenues.append(
                            float(match.replace("$", "").replace(",", ""))
                        )
                    except ValueError:
                        continue
        # Check for date-related lines
        if any(tag in line for tag in quarter_tags) or fiscal_year_pattern.search(line):
            dates.append(line.strip())

    # Ensure the data lists are of equal length
    min_length = min(len(revenues), len(dates))
    return {
        "Revenue": revenues[:min_length],
        "Date": dates[:min_length],
    }
def ingest_documents():
    """Load everything under ``./files/`` via SimpleDirectoryReader and return the documents."""
    return SimpleDirectoryReader("./files/").load_data()
def load_data(documents):
    """Build and return a VectorStoreIndex over ``documents``."""
    return VectorStoreIndex.from_documents(documents)
def generate_summary(index, document_text, query, target_language):
    """Ask the index's query engine for a financial analysis of the document.

    The prompt embeds the user's query and the full document text and asks for
    the answer to be written in ``target_language``.

    Args:
        index: Object exposing ``as_query_engine()``.
        document_text: Text of the uploaded document, inlined into the prompt.
        query: The user's question, in any language.
        target_language: Language identifier inserted into the prompt.

    Returns:
        The ``response`` attribute of the query engine's result.
    """
    engine = index.as_query_engine()
    # Single prompt: translate the query, analyse, and answer in the target language.
    prompt = f"""
You are a financial analyst and translator. Your task is to translate the following query into English,
analyze the financial document based on the translated query, and then translate the response back into {target_language}.
Query: {query}
Document: {document_text}
Please cover the following aspects:
1. Revenue and profit trends
2. Key financial metrics
3. Comparison with previous periods
4. Future outlook or forecasts
5. Any notable financial risks or opportunities
Provide a clear, concise, and professional response in {target_language}.
"""
    answer = engine.query(prompt)
    return answer.response
def generate_comparison_graph(data):
    """Render a revenue line chart in the Streamlit page.

    Args:
        data: Mapping with ``"Date"`` (x-axis labels) and ``"Revenue"``
            (y values) sequences. If either is empty, a notice is written
            instead of a chart.
    """
    if not data["Date"] or not data["Revenue"]:
        st.write("Insufficient data for generating the revenue comparison graph.")
        return
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(data["Date"], data["Revenue"], marker="o", linestyle="-", color="b", label="Revenue")
        ax.set_title("Revenue Comparison")
        ax.set_xlabel("Date")
        ax.set_ylabel("Revenue (in millions)")
        ax.grid(True)
        ax.legend()
        # Style this figure's own tick labels rather than going through
        # pyplot's global "current axes" state.
        for tick_label in ax.get_xticklabels():
            tick_label.set_rotation(45)
            tick_label.set_horizontalalignment("right")
        fig.tight_layout()
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure alive until it is closed; without this
        # each Streamlit rerun would leak another figure.
        plt.close(fig)
# Streamlit app
def _read_uploaded_text(uploaded_file):
    """Return the text of an uploaded file: PDF page text joined, otherwise UTF-8 decoded bytes."""
    raw = uploaded_file.getvalue()
    if uploaded_file.type == "application/pdf":
        pdf_reader = PyPDF2.PdfReader(BytesIO(raw))
        # Join once instead of repeated +=; `or ""` guards against a page
        # whose extraction yields no text.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    return raw.decode("utf-8")


def main():
    """Run the Streamlit UI: upload a document, index it, answer a query, and plot revenue."""
    st.title("Fortune 500 Financial Document Analyzer")
    st.write("Upload a financial document, ask questions in your preferred language, and get detailed analysis!")
    uploaded_file = st.file_uploader("Choose a financial document file", type=["pdf"])
    # Add language selection (display name -> language code passed to the prompt)
    languages = {
        'English': 'en',
        'Hindi': 'hi',
        'Kannada': 'kn',
        'Spanish': 'es',
        'French': 'fr',
        'German': 'de',
    }
    selected_language = st.selectbox("Select your preferred language", list(languages.keys()))
    target_language = languages[selected_language]
    if uploaded_file is not None:
        document_text = _read_uploaded_text(uploaded_file)
        # Persist the upload so the directory reader can ingest it.
        write_to_file(uploaded_file.getvalue())
        st.write("Analyzing financial document...")
        # Extract financial data
        financial_data = extract_financial_data(document_text)
        # Ingest documents for summarization and query-driven analysis
        documents = ingest_documents()
        index = load_data(documents)
        # Modify the query input to use the selected language
        query = st.text_input(f"Enter your financial analysis query in {selected_language}", "")
        if query:
            summary = generate_summary(index, document_text, query, target_language)
            st.write(f"## Financial Analysis Result (in {selected_language})")
            st.write(summary)
        # Display revenue comparison graph
        if financial_data["Revenue"] and financial_data["Date"]:
            st.write("## Revenue Comparison")
            generate_comparison_graph(financial_data)
        else:
            st.write("No revenue data found for comparison.")


if __name__ == "__main__":
    main()