# NOTE: the three lines that originally appeared here ("File size: ...", a run of
# git-blame commit hashes, and a line-number gutter) were artifacts of copying
# this file from a code-hosting blame view, not part of the source.
import torch
import numpy as np
import pandas as pd
from newsfetch.news import newspaper
from transformers import pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from newspaper import Article
from sklearn.preprocessing import LabelEncoder
import joblib
from datetime import datetime
# Example usage:
import streamlit as st
def main():
    """Streamlit entry point.

    Takes a URL (or raw article text) from the user, classifies it as
    supply-chain "risks" / "opportunities" / "neither" with a fine-tuned
    XLNet sequence classifier, summarizes it with T5, and offers a
    question-answering box over the article content.
    """
    st.title("URL and Text Input App")

    # URL (or raw article text) typed by the user.
    url_input = st.text_input("Enter URL:", "")

    def scrape_news_content(url):
        """Download *url* with newsfetch and return the article body text.

        Returns an "Error: ..." string instead of raising so the UI keeps
        rendering when scraping fails (deliberate best-effort behavior,
        kept from the original).
        """
        try:
            news_article = newspaper(url)
            return news_article.article
        except Exception as e:
            return "Error: " + str(e)

    def summarize_with_t5(article_content, classification, model, tokenizer, device):
        """Summarize *article_content* with T5 according to *classification*.

        Returns a (risk_summary, opportunity_summary) pair of strings;
        exactly one side carries the summary, the other is the sentinel
        string "None". Empty/"nan" content yields ("", "").
        """
        article_content = str(article_content)
        if not article_content or article_content == "nan":
            return "", ""

        if classification == "risks":
            prompt = "summarize the key supply chain risks: "
        elif classification == "opportunities":
            prompt = "summarize the key supply chain opportunities: "
        else:
            # "neither" or any unexpected label: nothing to summarize.
            # BUG FIX: the original fell through with no return for labels it
            # did not recognize, so the caller's tuple unpack raised TypeError.
            st.write("This article is not classified as related to the supply chain.")
            return "None", "None"

        input_ids = tokenizer.encode(prompt + article_content, return_tensors="pt").to(device)
        model = model.to(device)  # keep model and inputs on the same device
        summary_ids = model.generate(
            input_ids, max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True
        )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        st.write("This article is related to the supply chain.")
        if classification == "risks":
            return summary, "None"
        return "None", summary

    def classify_and_summarize(input_text, cls_model, tokenizer_cls, label_encoder,
                               model_summ, tokenizer_summ, device):
        """Classify *input_text* (URL or raw text) and summarize it.

        Returns (classification, risk_summary, opportunity_summary,
        article_content).
        """
        if input_text.startswith("http"):
            # Input looks like a URL: fetch the article body.
            article_content = scrape_news_content(input_text)
            # BUG FIX: the original echoed the closed-over `url_input`
            # instead of this function's `input_text` parameter.
            st.write("Entered URL:", input_text)
        else:
            # Otherwise treat the input itself as the article content.
            article_content = input_text

        # Show a 150-word preview, expandable to the full text.
        truncated_content = " ".join(article_content.split()[:150])
        st.markdown(f"Truncated Content:\n{truncated_content}", unsafe_allow_html=True)
        if st.button("Read More"):
            full_content = " ".join(article_content.split())
            st.markdown(f"Full Content:\n{full_content}", unsafe_allow_html=True)
            # NOTE(review): the original also injected a <script> tag here to
            # hide the preview; Streamlit strips <script> from markdown, so it
            # had no effect and was removed.

        # Tokenize (truncated to the model's 512-token limit), classify, and
        # map the argmax class id back to its label name.
        inputs_cls = tokenizer_cls(article_content, return_tensors="pt",
                                   max_length=512, truncation=True)
        inputs_cls = {key: value.to(device) for key, value in inputs_cls.items()}
        cls_model = cls_model.to(device)
        outputs_cls = cls_model(**inputs_cls)
        predicted_class = torch.argmax(outputs_cls.logits, dim=1).item()
        classification = label_encoder.inverse_transform([predicted_class])[0]

        summary_risk, summary_opportunity = summarize_with_t5(
            article_content, classification, model_summ, tokenizer_summ, device
        )
        # Defensive defaults, kept from the original.
        if summary_risk is None:
            summary_risk = "No risk summary available"
        if summary_opportunity is None:
            summary_opportunity = "No opportunity summary available"
        return classification, summary_risk, summary_opportunity, article_content

    # --- Model / tokenizer setup -------------------------------------------
    # NOTE(review): these load on every Streamlit rerun; consider wrapping in
    # a @st.cache_resource helper.
    cls_model = AutoModelForSequenceClassification.from_pretrained(
        "riskclassification_finetuned_xlnet_model_ld"
    )
    tokenizer_cls = AutoTokenizer.from_pretrained("xlnet-base-cased")

    # Fit the encoder on the fixed label set so inverse_transform can map
    # class ids back to names, then persist it next to the model weights.
    # (fit() suffices — the original called fit_transform and discarded the
    # transformed values.)
    label_encoder = LabelEncoder()
    label_encoder.fit(["risks", "opportunities", "neither"])
    joblib.dump(label_encoder, "riskclassification_finetuned_xlnet_model_ld/encoder_labels.pkl")

    model_summ = T5ForConditionalGeneration.from_pretrained("t5-small")
    tokenizer_summ = T5Tokenizer.from_pretrained("t5-small")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # --- Run the pipeline and display the results --------------------------
    classification, summary_risk, summary_opportunity, article_content = classify_and_summarize(
        url_input, cls_model, tokenizer_cls, label_encoder, model_summ, tokenizer_summ, device
    )
    st.write("Classification:", classification)
    st.write("Risk Summary:", summary_risk)
    st.write("Opportunity Summary:", summary_opportunity)

    def process_question():
        """Question-answering box over the article content.

        Persists a (question, answer) history in st.session_state across
        Streamlit reruns.
        """
        if 'qa_history' not in st.session_state:
            st.session_state.qa_history = []

        # Widget keys derived from a counter so fresh widgets can be issued.
        user_question_key = st.session_state.get('question_counter', 0)
        user_question = st.text_input("Ask a question about the article content:",
                                      key=user_question_key)

        send_button_key = f"send_button_{user_question_key}"
        if st.button("Send", key=send_button_key) and user_question:
            # NOTE(review): the QA pipeline is rebuilt on every click;
            # consider caching it with st.cache_resource.
            model_name = "deepset/tinyroberta-squad2"
            nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
            res = nlp({'question': user_question, 'context': article_content})
            st.write(f"You asked: {user_question}")
            st.write("Model's Answer:", res["answer"])
            st.session_state.qa_history.append((user_question, res["answer"]))

        st.write("Question-Answer History:")
        for q, a in st.session_state.qa_history:
            st.write(f"Q: {q}")
            st.write(f"A: {a}")

    process_question()


if __name__ == "__main__":
    main()