Spaces:

mgmtprofessor
/

risk_factors_scoring

Sleeping

App Files Files Community

Update app.py

by simpsonjj - opened Oct 16, 2024

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+123

-123

Files changed (1) hide show

app.py +123 -123

app.py CHANGED Viewed

@@ -1,123 +1,123 @@
-import os
-import streamlit as st
-import torch
-import pandas as pd
-import time
-from tqdm import tqdm
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-# Set up Streamlit app
-st.title("Document Scoring App for Various Risk Factors Categories")
-# Hugging Face model directories
-model_directories = {
-    'finance': 'mgmtprofessor/finance_risk_factors',
-    'accounting': 'mgmtprofessor/accounting_risk_factors',
-    'technology': 'mgmtprofessor/technology_risk_factors',
-    'international': 'mgmtprofessor/international_risk_factors',
-    'operations': 'mgmtprofessor/operations_risk_factors',
-    'marketing': 'mgmtprofessor/marketing_risk_factors',
-    'management': 'mgmtprofessor/management_risk_factors',
-    'legal': 'mgmtprofessor/legal_risk_factors'
-}
-# Check if CUDA is available
-use_cuda = torch.cuda.is_available()
-# Function to load a model from Hugging Face
-def load_model(category):
-    try:
-        # Load the model from Hugging Face based on the category
-        model_name = model_directories.get(category)
-        if model_name:
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
-            model = AutoModelForSequenceClassification.from_pretrained(model_name)
-            return model, tokenizer
-        else:
-            st.error(f"No Hugging Face model found for {category}")
-            return None, None
-    except Exception as e:
-        st.error(f"Failed to load model for {category}: {e}")
-        return None, None
-# Function to score a document and return the prediction and probability for class '1'
-def score_document(model, tokenizer, text_data):
-    if isinstance(text_data, str):
-        text_data = [text_data]
-    # Tokenize the input
-    inputs = tokenizer(text_data, return_tensors="pt", padding=True, truncation=True)
-    # Perform the prediction
-    with torch.no_grad():
-        outputs = model(**inputs)
-    # Get probabilities (softmax)
-    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
-    # Get the prediction (class with highest probability)
-    predictions = torch.argmax(probabilities, dim=1)
-    # Get the probability associated with class '1'
-    probability_class_1 = probabilities[:, 1].item()
-    return predictions.item(), probability_class_1
-# Let the user upload a file
-doc_file = st.file_uploader("Upload a document (.txt)", type=["txt"])
-# Track the start time
-start_time = time.time()
-# Make predictions when a file is uploaded
-if doc_file is not None:
-    # Read the content of the uploaded .txt file
-    text_data = doc_file.read().decode("utf-8")
-    # Initialize an empty DataFrame for results
-    result_df = pd.DataFrame(columns=["Category", "Prediction", "Probability"])
-    # Progress bar
-    progress_bar = st.progress(0)
-    total_categories = len(model_directories)
-    for i, category in enumerate(tqdm(model_directories.keys(), desc="Scoring documents")):
-        # Load the pre-trained model for the current category
-        model, tokenizer = load_model(category)
-        # Skip the category if model loading fails
-        if model is not None:
-            # Score the document
-            prediction, probability = score_document(model, tokenizer, text_data)
-            # Create a DataFrame for the current result
-            new_row = pd.DataFrame({
-                "Category": [category],
-                "Prediction": [prediction],
-                "Probability": [probability]
-            })
-            # Use pd.concat to append the new row to the DataFrame
-            result_df = pd.concat([result_df, new_row], ignore_index=True)
-        # Update the progress bar
-        progress_bar.progress((i + 1) / total_categories)
-        # Estimate remaining time
-        elapsed_time = time.time() - start_time
-        estimated_total_time = (elapsed_time / (i + 1)) * total_categories
-        st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")
-    # Save results to CSV
-    csv = result_df.to_csv(index=False).encode('utf-8')
-    st.download_button(
-        label="Download results as CSV",
-        data=csv,
-        file_name="document_scoring_results.csv",
-        mime="text/csv",
-    )
-    # Display completion message
-    st.success("Document scoring complete!")
-st.write("Note: Ensure the uploaded document is in .txt format containing text data.")

+import os
+import streamlit as st
+import torch
+import pandas as pd
+import time
+from tqdm import tqdm
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+# Set up Streamlit app
+st.title("Document Scoring App for Various Risk Factors Categories")
+# Hugging Face model directories
+model_directories = {
+    'finance': 'mgmtprofessor/finance_risk_factors',
+    'accounting': 'mgmtprofessor/accounting_risk_factors',
+    'technology': 'mgmtprofessor/technology_risk_factors',
+    'international': 'mgmtprofessor/international_risk_factors',
+    'operations': 'mgmtprofessor/operations_risk_factors',
+    'marketing': 'mgmtprofessor/marketing_risk_factors',
+    'management': 'mgmtprofessor/management_risk_factors',
+    'legal': 'mgmtprofessor/legal_risk_factors'
+}
+# Check if CUDA is available
+use_cuda = torch.cuda.is_available()
+# Function to load a model from Hugging Face
+def load_model(category):
+    try:
+        # Load the model from Hugging Face based on the category
+        model_name = model_directories.get(category)
+        if model_name:
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            model = AutoModelForSequenceClassification.from_pretrained(model_name)
+            return model, tokenizer
+        else:
+            st.error(f"No Hugging Face model found for {category}")
+            return None, None
+    except Exception as e:
+        st.error(f"Failed to load model for {category}: {e}")
+        return None, None
+# Function to score a document and return the prediction and probability for class '1'
+def score_document(model, tokenizer, text_data):
+    if isinstance(text_data, str):
+        text_data = [text_data]
+    # Tokenize the input
+    inputs = tokenizer(text_data, return_tensors="pt", padding=True, truncation=True)
+    # Perform the prediction
+    with torch.no_grad():
+        outputs = model(**inputs)
+    # Get probabilities (softmax)
+    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
+    # Get the prediction (class with highest probability)
+    predictions = torch.argmax(probabilities, dim=1)
+    # Get the probability associated with class '1'
+    probability_class_1 = probabilities[:, 1].item()
+    return predictions.item(), probability_class_1
+# Let the user upload a file
+doc_file = st.file_uploader("Upload a document (.txt)", type=["txt"])
+# Track the start time
+start_time = time.time()
+# Make predictions when a file is uploaded
+if doc_file is not None:
+    # Read the content of the uploaded .txt file
+    text_data = doc_file.read().decode("utf-8")
+    # Initialize an empty DataFrame for results
+    result_df = pd.DataFrame(columns=["Category", "Prediction", "Probability"])
+    # Progress bar
+    progress_bar = st.progress(0)
+    total_categories = len(model_directories)
+    for i, category in enumerate(tqdm(model_directories.keys(), desc="Scoring documents")):
+        # Load the pre-trained model for the current category
+        model, tokenizer = load_model(category)
+        # Skip the category if model loading fails
+        if model is not None:
+            # Score the document
+            prediction, probability = score_document(model, tokenizer, text_data)
+            # Create a DataFrame for the current result
+            new_row = pd.DataFrame({
+                "Category": [category],
+                "Prediction": [prediction],
+                "Probability": [probability]
+            })
+            # Use pd.concat to append the new row to the DataFrame
+            result_df = pd.concat([result_df, new_row], ignore_index=True)
+        # Update the progress bar
+        progress_bar.progress((i + 1) / total_categories)
+        # Estimate remaining time
+        elapsed_time = time.time() - start_time
+        estimated_total_time = (elapsed_time / (i + 1)) * total_categories
+        st.write(f"Elapsed time: {elapsed_time:.2f}s, Estimated time remaining: {estimated_total_time - elapsed_time:.2f}s")
+    # Save results to CSV
+    csv = result_df.to_csv(index=False).encode('utf-8')
+    st.download_button(
+        label="Download results as CSV",
+        data=csv,
+        file_name="document_scoring_results.csv",
+        mime="text/csv",
+    )
+    # Display completion message
+    st.success("Document scoring complete!")
+st.write("Note: Ensure the uploaded document is in .txt format containing text data. The current limit is 512 tokens and will be increased later.")