Komal133's picture
Update app.py
c76c941 verified
import streamlit as st
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from simple_salesforce import Salesforce
import torch
from PyPDF2 import PdfReader
import re
import seaborn as sns
import matplotlib.pyplot as plt
# Salesforce connection
def connect_to_salesforce():
sf = Salesforce(
username='your_username',
password='your_password',
security_token='your_security_token',
domain='login' # or 'test' for sandbox
)
return sf
# Extract text from PDF
def extract_text_from_pdf(pdf_file):
reader = PdfReader(pdf_file)
text = ""
for page in reader.pages:
text += page.extract_text() + "\n"
return text
# Split text into clauses
def split_into_clauses(text):
clauses = re.split(r'\n\s*\d+\.\s*|\n\s*[A-Z]\.\s*', text)
clauses = [clause.strip() for clause in clauses if clause.strip()]
return clauses
# Load BERT model and tokenizer
@st.cache_resource
def load_model():
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3) # Fine-tuned for 3 risk levels
return tokenizer, model
# Process clauses and assign risk scores
def process_clauses(clauses, tokenizer, model):
results = []
risk_levels = {0: 'Low', 1: 'Medium', 2: 'High'}
for clause in clauses:
inputs = tokenizer(clause, return_tensors="pt", truncation=True, padding=True, max_length=512)
with torch.no_grad():
outputs = model(**inputs)
logits = outputs.logits
risk_score = torch.softmax(logits, dim=1).numpy()[0]
risk_level = risk_levels[np.argmax(risk_score)]
results.append({
'clause_text': clause,
'risk_level': risk_level,
'severity_score': float(np.max(risk_score)),
'clause_type': infer_clause_type(clause) # Simplified clause type inference
})
return results
# Simplified clause type inference (extend with more sophisticated logic as needed)
def infer_clause_type(clause):
if 'liability' in clause.lower():
return 'Liability'
elif 'payment' in clause.lower():
return 'Payment'
else:
return 'General'
# Save results to Salesforce
def save_to_salesforce(sf, results, contract_id):
for result in results:
sf.Contract_Risk__c.create({
'Contract__c': contract_id,
'Clause_Text__c': result['clause_text'][:255], # Truncate if needed
'Risk_Level__c': result['risk_level'],
'Severity_Score__c': result['severity_score'],
'Clause_Type__c': result['clause_type']
})
# Generate heatmap
def generate_heatmap(results):
df = pd.DataFrame(results)
risk_scores = df['severity_score'].values
plt.figure(figsize=(10, 2))
sns.heatmap([risk_scores], cmap='RdYlGn_r', annot=True, fmt='.2f', cbar_kws={'label': 'Risk Severity'})
plt.title('Contract Clause Risk Heatmap')
plt.xlabel('Clause Index')
plt.yticks([])
st.pyplot(plt)
# Streamlit interface
def main():
st.title("Contract Risk Analyzer")
# File upload
uploaded_file = st.file_uploader("Upload Contract PDF", type=["pdf"])
contract_id = st.text_input("Enter Contract ID")
if uploaded_file and contract_id:
# Extract and process text
text = extract_text_from_pdf(uploaded_file)
clauses = split_into_clauses(text)
# Load model and process clauses
tokenizer, model = load_model()
results = process_clauses(clauses, tokenizer, model)
# Display results
st.subheader("Clause Analysis Results")
for i, result in enumerate(results, 1):
st.write(f"**Clause {i}**")
st.write(f"Text: {result['clause_text'][:100]}...")
st.write(f"Clause Type: {result['clause_type']}")
st.write(f"Risk Level: {result['risk_level']}")
st.write(f"Severity Score: {result['severity_score']:.2f}")
st.write("---")
# Generate and display heatmap
generate_heatmap(results)
# Save to Salesforce
if st.button("Save to Salesforce"):
sf = connect_to_salesforce()
save_to_salesforce(sf, results, contract_id)
st.success("Results saved to Salesforce!")
if __name__ == "__main__":
main()