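# NOTE: "Spaces: Sleeping" appears to be Hugging Face Spaces page-status text
# captured together with this file; it is not part of the program.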
import os | |
import streamlit as st | |
import PyPDF2 | |
import matplotlib.pyplot as plt | |
from io import BytesIO | |
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader | |
from llama_index.embeddings.fastembed import FastEmbedEmbedding | |
import re | |
import requests | |
import dotenv | |
# Populate os.environ through python-dotenv before any setting below is read.
dotenv.load_dotenv()

# Hugging Face Inference API endpoint for the Sarvam model, plus the bearer
# token header built from the HUGGINGFACE_API_KEY environment variable.
API_URL = "https://api-inference.huggingface.co/models/sarvamai/sarvam-2b-v0.5"
headers = {"Authorization": f"Bearer {os.environ.get('HUGGINGFACE_API_KEY')}"}

# Embedding model that llama_index uses when building vector indexes.
Settings.embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")
def query_huggingface_api(payload, timeout=60):
    """POST ``payload`` to ``API_URL`` and return the decoded JSON body.

    Args:
        payload: JSON-serialisable request body, e.g. ``{"inputs": "..."}``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.post`` can block indefinitely.

    Returns:
        The parsed JSON response. If the request fails at the transport level
        or the body is not valid JSON, a dict ``{"error": "<message>"}`` is
        returned instead of raising, so callers that check for a list result
        simply see "no usable response".
    """
    try:
        response = requests.post(
            API_URL, headers=headers, json=payload, timeout=timeout
        )
    except requests.RequestException as exc:
        return {"error": f"Request to inference API failed: {exc}"}

    try:
        return response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON
        # (for example an HTML error page).
        return {
            "error": "Inference API returned a non-JSON response "
                     f"(HTTP {response.status_code})."
        }
def write_to_file(content, filename="./files/uploaded.pdf"):
    """Write the bytes ``content`` to ``filename``, creating parent directories.

    Args:
        content: Bytes to write; the file is opened in binary mode.
        filename: Destination path. Any missing parent directories are created.
    """
    parent_dir = os.path.dirname(filename)
    # dirname() is "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one to create.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, "wb") as f:
        f.write(content)
def extract_financial_data(document_text):
    """Heuristically pull revenue figures and reporting-period labels from text.

    For every line mentioning "revenue" or "sales" (case-insensitive), each
    number found on the next five lines is recorded as a revenue value.
    Independently, every line containing ``Q1``..``Q4`` or ``FY`` followed by
    four digits is recorded (stripped) as a date label.

    Args:
        document_text: Full document text; split on newlines.

    Returns:
        ``{"Revenue": [float, ...], "Date": [str, ...]}`` with both lists
        truncated to the same length (the shorter of the two), so entries can
        be paired positionally.
    """
    number_pattern = re.compile(r'\$?\d+(?:,\d{3})*(?:\.\d+)?')
    period_pattern = re.compile(r'Q[1-4]|FY\s*\d{4}')

    lines = document_text.split("\n")
    revenues = []
    dates = []

    for i, line in enumerate(lines):
        lowered = line.lower()
        # "total revenue" is already covered by the "revenue" substring test.
        if "revenue" in lowered or "sales" in lowered:
            # Slicing clamps at the end of the list, so no bounds check needed.
            for following in lines[i + 1:i + 6]:
                # Every token matched by number_pattern is digits with optional
                # "$", "," and one decimal part, so float() cannot fail once
                # "$" and "," are stripped.
                revenues.extend(
                    float(token.replace("$", "").replace(",", ""))
                    for token in number_pattern.findall(following)
                )

        if period_pattern.search(line):
            dates.append(line.strip())

    # Keep only as many entries as can be paired one-to-one.
    paired = min(len(revenues), len(dates))
    return {"Revenue": revenues[:paired], "Date": dates[:paired]}
def ingest_documents():
    """Load the contents of the ``./files/`` directory as llama_index documents."""
    return SimpleDirectoryReader("./files/").load_data()
def load_data(documents):
    """Build and return a ``VectorStoreIndex`` over ``documents``."""
    return VectorStoreIndex.from_documents(documents)
def generate_summary(index, document_text, query):
    """Ask the hosted model to analyse ``document_text`` with respect to ``query``.

    Args:
        index: Vector index for the document. Accepted for interface
            compatibility only; the prompt embeds the full document text, so
            the index is not consulted. (The original built a query engine
            from it and never used the result.)
        document_text: Full text of the financial document, inserted verbatim
            into the prompt.
        query: The user's question.

    Returns:
        The ``generated_text`` of the first result returned by
        ``query_huggingface_api``, or ``"No response from model."`` when the
        response is not a non-empty list whose first item carries that key.
    """
    prompt = f"""
You are a financial analyst. Your task is to provide a comprehensive analysis of the financial document.
Analyze the following document and respond to the query:
{document_text}
Query: {query}
If the query is too general, respond with:
Please cover the following aspects:
1. Revenue and profit trends
2. Key financial metrics
3. Major financial events and decisions
4. Comparison with previous periods
5. Future outlook or forecasts
6. Any notable financial risks or opportunities
Provide a clear, concise, and professional response.
"""
    response = query_huggingface_api({"inputs": prompt})

    # Validate the shape before indexing so an unexpected payload yields the
    # fallback message rather than a KeyError/TypeError.
    if isinstance(response, list) and response:
        first = response[0]
        if isinstance(first, dict) and "generated_text" in first:
            return first["generated_text"]
    return "No response from model."
def generate_comparison_graph(data):
    """Render a revenue-over-period line chart in the Streamlit page.

    Args:
        data: Mapping with ``"Date"`` (x-axis labels) and ``"Revenue"``
            (y values) sequences. If either is empty, a notice is written
            to the page and no chart is drawn.
    """
    if not data["Date"] or not data["Revenue"]:
        st.write("Insufficient data for generating the revenue comparison graph.")
        return

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(data["Date"], data["Revenue"], marker="o", linestyle="-", color="b", label="Revenue")
        ax.set_title("Revenue Comparison")
        ax.set_xlabel("Date")
        ax.set_ylabel("Revenue (in millions)")
        ax.grid(True)
        ax.legend()
        # Work on this figure's own axes rather than pyplot's implicit
        # "current axes" state.
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
        fig.tight_layout()
        st.pyplot(fig)
    finally:
        # pyplot keeps a reference to every figure it creates until it is
        # closed; close it so figures do not pile up across repeated calls.
        plt.close(fig)
# Streamlit app
def main():
    """Run the Streamlit UI: upload a document, answer a query, plot revenue.

    Flow: read the uploaded PDF/TXT into text, persist the raw upload via
    ``write_to_file``, extract revenue/date pairs, build a vector index over
    the saved files, answer the user's query (if one is entered), and draw
    the revenue comparison chart when data was found.
    """
    st.title("Fortune 500 Financial Document Analyzer")
    st.write("Upload a financial document, ask questions, and get detailed analysis!")

    uploaded_file = st.file_uploader("Choose a financial document file", type=["pdf", "txt"])
    if uploaded_file is None:
        return

    # Read the upload once and reuse the bytes for parsing and for saving.
    raw_bytes = uploaded_file.getvalue()

    if uploaded_file.type == "application/pdf":
        pdf_reader = PyPDF2.PdfReader(BytesIO(raw_bytes))
        # join() avoids repeated string concatenation; `or ""` is a defensive
        # guard in case a page yields no text.
        document_text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
    else:
        # errors="replace" keeps a non-UTF-8 text file from aborting the app
        # with UnicodeDecodeError.
        document_text = raw_bytes.decode("utf-8", errors="replace")

    # NOTE(review): write_to_file's default target is "./files/uploaded.pdf",
    # so a .txt upload is also stored under a .pdf name - confirm the
    # directory reader copes with that, or pass an explicit filename.
    write_to_file(raw_bytes)

    st.write("Analyzing financial document...")

    # Extract financial data
    financial_data = extract_financial_data(document_text)

    # Ingest documents for summarization and query-driven analysis
    documents = ingest_documents()
    index = load_data(documents)

    # Add a provision for user query input
    query = st.text_input("Enter your financial analysis query (e.g., 'What are the revenue trends?')", "")
    if query:
        summary = generate_summary(index, document_text, query)
        st.write("## Financial Analysis Result")
        st.write(summary)

    # Display revenue comparison graph
    if financial_data["Revenue"] and financial_data["Date"]:
        st.write("## Revenue Comparison")
        generate_comparison_graph(financial_data)
    else:
        st.write("No revenue data found for comparison.")


if __name__ == "__main__":
    main()