File size: 8,558 Bytes
7b6fda9
 
 
 
 
 
 
4943759
7b6fda9
 
 
4943759
7b6fda9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4943759
 
 
 
 
7b6fda9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4943759
7b6fda9
 
 
 
4943759
7b6fda9
 
 
 
4943759
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b6fda9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4943759
7b6fda9
 
 
eb08149
4943759
 
7b6fda9
 
 
 
 
 
 
 
 
 
 
 
 
 
202989b
7b6fda9
 
 
 
 
 
 
 
4943759
7b6fda9
 
 
 
 
 
 
 
 
 
 
4943759
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b6fda9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import torch
import numpy as np
import pandas as pd
from newsfetch.news import newspaper
from transformers import pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from newspaper import Article
from sklearn.preprocessing import LabelEncoder
import joblib
from datetime import datetime


# Example usage:

import streamlit as st

def main():
    """Streamlit app: classify a news article as supply-chain 'risks',
    'opportunities', or 'neither', summarize it with T5, and answer
    follow-up questions about the article content.
    """
    st.title("URL and Text Input App")

    # URL or raw article text, entered by the user.
    url_input = st.text_input("Enter URL:", "")

    # Don't run the expensive model pipeline on empty input — the original
    # classified "" on every initial page load.
    if not url_input.strip():
        st.info("Enter a URL or paste article text above to begin.")
        return

    cls_model, tokenizer_cls, label_encoder, model_summ, tokenizer_summ, device = _load_models()

    classification, summary_risk, summary_opportunity, article_content = _classify_and_summarize(
        url_input, cls_model, tokenizer_cls, label_encoder, model_summ, tokenizer_summ, device
    )

    st.write("Classification:", classification)
    st.write("Risk Summary:", summary_risk)
    st.write("Opportunity Summary:", summary_opportunity)

    _process_question(article_content)


# NOTE(review): st.cache_resource requires streamlit >= 1.18; on older
# versions substitute st.experimental_singleton.
@st.cache_resource
def _load_models():
    """Load and cache the classifier, summarizer, tokenizers, label encoder.

    Cached so models are instantiated once per server process instead of on
    every Streamlit rerun (the original reloaded everything per interaction).

    Returns:
        (cls_model, tokenizer_cls, label_encoder, model_summ,
         tokenizer_summ, device)
    """
    cls_model = AutoModelForSequenceClassification.from_pretrained(
        "riskclassification_finetuned_xlnet_model_ld"
    )
    tokenizer_cls = AutoTokenizer.from_pretrained("xlnet-base-cased")

    # Fit on the known label set so inverse_transform maps class indices
    # back to label strings. (LabelEncoder sorts labels alphabetically:
    # neither=0, opportunities=1, risks=2 — must match training encoding.)
    label_encoder = LabelEncoder()
    label_encoder.fit(["risks", "opportunities", "neither"])

    # Persist the fitted encoder next to the model, as the original did.
    joblib.dump(
        label_encoder,
        "riskclassification_finetuned_xlnet_model_ld/encoder_labels.pkl",
    )

    model_summ = T5ForConditionalGeneration.from_pretrained("t5-small")
    tokenizer_summ = T5Tokenizer.from_pretrained("t5-small")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return cls_model, tokenizer_cls, label_encoder, model_summ, tokenizer_summ, device


def _scrape_news_content(url):
    """Fetch an article with newsfetch and return its body text.

    Preserves the original contract: on failure an "Error: ..." string is
    returned and flows downstream as the article content.
    """
    try:
        news_article = newspaper(url)
        return news_article.article
    except Exception as e:  # network / parse failures surfaced as text
        return "Error: " + str(e)


def _summarize_with_t5(article_content, classification, model, tokenizer, device):
    """Summarize the article with T5 using a classification-specific prompt.

    Returns:
        (risk_summary, opportunity_summary) — exactly one side carries the
        generated summary, the other is "None". Empty/"nan" content yields
        ("", ""); the "neither" class (and any unknown label) yields
        ("None", "None").
    """
    article_content = str(article_content)
    if not article_content or article_content == "nan":
        return "", ""
    if classification == "neither":
        return "None", "None"

    if classification == "risks":
        prompt = "summarize the key supply chain risks: "
    elif classification == "opportunities":
        prompt = "summarize the key supply chain opportunities: "
    else:
        # BUG FIX: the original fell through here and implicitly returned a
        # bare None, which crashed the caller's 2-tuple unpacking.
        return "None", "None"

    # Truncate to the model context window; the original passed unbounded
    # input and could exceed T5's maximum input length on long articles.
    input_ids = tokenizer.encode(
        prompt + article_content,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    ).to(device)

    model = model.to(device)
    summary_ids = model.generate(
        input_ids, max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    st.write("This article is related to the supply chain.")
    if classification == "risks":
        return summary, "None"
    return "None", summary


def _classify_and_summarize(input_text, cls_model, tokenizer_cls, label_encoder,
                            model_summ, tokenizer_summ, device):
    """Resolve input to article content, classify it, then summarize it.

    Returns:
        (classification, risk_summary, opportunity_summary, article_content)
    """
    if input_text.startswith("http"):
        # A URL: scrape the article body.
        article_content = _scrape_news_content(input_text)
        st.write("Entered URL:", input_text)
    else:
        # Otherwise treat the input itself as the article content.
        article_content = input_text

    # 150-word preview, expandable via the button below.
    truncated_content = " ".join(article_content.split()[:150])
    st.markdown(f"Truncated Content:\n{truncated_content}", unsafe_allow_html=True)
    if st.button("Read More"):
        # NOTE: the original also injected a <script> tag to hide the
        # preview; Streamlit sanitizes script tags out of markdown, so that
        # never executed and is dropped here.
        full_content = " ".join(article_content.split())
        st.markdown(f"Full Content:\n{full_content}", unsafe_allow_html=True)

    # Sequence classification (XLNet fine-tune), truncated to 512 tokens.
    inputs_cls = tokenizer_cls(
        article_content, return_tensors="pt", max_length=512, truncation=True
    )
    inputs_cls = {key: value.to(device) for key, value in inputs_cls.items()}
    cls_model = cls_model.to(device)
    outputs_cls = cls_model(**inputs_cls)
    predicted_class = torch.argmax(outputs_cls.logits, dim=1).item()
    classification = label_encoder.inverse_transform([predicted_class])[0]

    summary_risk, summary_opportunity = _summarize_with_t5(
        article_content, classification, model_summ, tokenizer_summ, device
    )
    # Defensive defaults so the UI never shows a literal None.
    if summary_risk is None:
        summary_risk = "No risk summary available"
    if summary_opportunity is None:
        summary_opportunity = "No opportunity summary available"

    return classification, summary_risk, summary_opportunity, article_content


def _process_question(article_content):
    """Render a Q&A box over the article content with a persisted history."""
    # session_state persists question/answer pairs across reruns.
    if "qa_history" not in st.session_state:
        st.session_state.qa_history = []

    question_key = st.session_state.get("question_counter", 0)
    user_question = st.text_input(
        "Ask a question about the article content:", key=question_key
    )

    if st.button("Send", key=f"send_button_{question_key}") and user_question:
        # Extractive QA with the article as context.
        model_name = "deepset/tinyroberta-squad2"
        nlp = pipeline("question-answering", model=model_name, tokenizer=model_name)
        res = nlp({"question": user_question, "context": article_content})

        st.write(f"You asked: {user_question}")
        st.write("Model's Answer:", res["answer"])
        st.session_state.qa_history.append((user_question, res["answer"]))

    st.write("Question-Answer History:")
    for q, a in st.session_state.qa_history:
        st.write(f"Q: {q}")
        st.write(f"A: {a}")


if __name__ == "__main__":
    main()