Spaces:

marcelomoreno26
/

Whatsapp-Chat-Summarizer-and-Analysis

Runtime error

App Files Files Community

marcelomoreno26 committed on Apr 15, 2024

Commit

9d54e27

•

1 Parent(s): dd204e1

Upload 3 files

Browse files

Files changed (3) hide show

model_functions.py +91 -0
preprocessor.py +95 -0
requirements.txt +5 -0

model_functions.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import torch
+from transformers import (AutoModelForSequenceClassification, AutoModelForSeq2SeqLM,
+                          AutoConfig, AutoModelForTokenClassification,
+                          AutoTokenizer, pipeline)
+from peft import PeftModel, PeftConfig
+def load_sentiment_analyzer():
+    tokenizer = AutoTokenizer.from_pretrained("aliciiavs/sentiment-analysis-whatsapp2")
+    model = AutoModelForSequenceClassification.from_pretrained("aliciiavs/sentiment-analysis-whatsapp2")
+    return tokenizer, model
+def load_summarizer():
+    config = PeftConfig.from_pretrained("marcelomoreno26/bart-large-samsum-adapter")
+    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large")
+    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")
+    tokenizer.pad_token = tokenizer.eos_token
+    model = PeftModel.from_pretrained(model, "marcelomoreno26/bart-large-samsum-adapter", config=config)
+    model = model.merge_and_unload()
+    return tokenizer, model
+def load_NER():
+    config = AutoConfig.from_pretrained("hannahisrael03/distilbert-base-uncased-finetuned-wikiann")
+    model = AutoModelForTokenClassification.from_pretrained("hannahisrael03/distilbert-base-uncased-finetuned-wikiann",config=config)
+    tokenizer = AutoTokenizer.from_pretrained("hannahisrael03/distilbert-base-uncased-finetuned-wikiann")
+    pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="average")
+    return pipe
+def get_sentiment_analysis(text, tokenizer, model):
+    inputs = tokenizer(text, padding=True, return_tensors="pt")
+    with torch.no_grad():
+        outputs = model(**inputs)
+    # Get predicted probabilities and predicted label
+    probabilities = torch.softmax(outputs.logits, dim=1)
+    predicted_label = torch.argmax(probabilities, dim=1)
+    # Convert the predicted label tensor to a Python integer
+    predicted_label = predicted_label.item()
+    # Map predicted label index to sentiment label
+    label_dic = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
+    # Print the predicted sentiment label
+    return label_dic[predicted_label]
+def generate_summary(text, tokenizer, model):
+    prefix = "summarize: "
+    encoded_input = tokenizer.encode_plus(prefix + text, return_tensors='pt', add_special_tokens=True)
+    input_ids = encoded_input['input_ids']
+    # Check if input_ids exceed the model's max length
+    max_length = 512
+    if input_ids.shape[1] > max_length:
+        # Split the input_ids into manageable segments
+        total_summary = []
+        for i in range(0, input_ids.shape[1], max_length - 50):  # We use max_length - 50 to allow for some room for the model to generate context
+            segment_ids = input_ids[:, i:i + max_length]
+            output_ids = model.generate(segment_ids, max_length=150, num_beams=5, early_stopping=True)
+            segment_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+            total_summary.append(segment_summary)
+        # Concatenate all segment summaries
+        summary = ' '.join(total_summary)
+    else:
+        # Process as usual
+        output_ids = model.generate(input_ids, max_length=150, num_beams=5, early_stopping=True)
+        summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+    return summary
+def get_NER(text, pipe):
+    # Use pipeline to predict NER
+    results = pipe(text)
+    # Filter duplicates while retaining the highest score for each entity type and word combination
+    unique_entities = {}
+    for ent in results:
+        key = (ent['entity_group'], ent['word'])
+        if key not in unique_entities or unique_entities[key]['score'] < ent['score']:
+            unique_entities[key] = ent
+    # Prepare the output, sorted by the start position to maintain the order they appear in the text
+    filtered_results = sorted(unique_entities.values(), key=lambda x: x['start'])
+    # Format the results for a table display
+    formatted_results = [[ent['word'], ent['entity_group']] for ent in filtered_results]
+    return formatted_results

preprocessor.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import pandas as pd
+import zipfile
+import re
+from io import BytesIO
+def detect_file_type(file_path):
+    type = file_path[-3:]
+    print(type)
+    if type in ["txt","zip"]:
+        return type
+    else:
+        return "unknown"
+def preprocess_whatsapp_messages(file_path, file_type):
+    """
+       Preprocesses the Whatsapp messages zip file into a Pandas Dataframe, all messages in one day go
+       to a row and a timestamp is added.
+       Args:
+           file_path (str): Location of the file (zip or txt) of the conversation.
+       Returns:
+           str: Dataframe
+       """
+    # Load the zip file and extract text data
+    print(file_type)
+    if file_type == "zip":
+        with zipfile.ZipFile(file_path, 'r') as z:
+            file_name = z.namelist()[0]
+            with z.open(file_name) as file:
+                text_data = file.read().decode('utf-8')
+    else:
+        text_data = BytesIO(file_path.getvalue()).read().decode('utf-8')
+    # Split the text data into lines
+    lines = text_data.strip().split('\n')
+    # Create a DataFrame
+    df = pd.DataFrame(lines, columns=['message'])
+    # Process each line to separate timestamp and text
+    df[['timestamp', 'text']] = df['message'].str.split(']', n=1, expand=True)
+    df['timestamp'] = df['timestamp'].str.strip('[')
+    # Handle cases where the split might not work (e.g., missing ']' in a line)
+    df.dropna(subset=['timestamp', 'text'], inplace=True)
+    # Convert timestamp to datetime and remove the time, keeping only the date
+    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%d/%m/%y, %H:%M:%S', errors='coerce').dt.date
+    # Drop rows where the timestamp conversion failed (which results in NaT)
+    df.dropna(subset=['timestamp'], inplace=True)
+    # Remove initial WhatsApp system messages in English and Spanish
+    filter_text_en = "Your messages and calls are end-to-end encrypted"
+    filter_text_es = "Los mensajes y las llamadas están cifrados de extremo a extremo"
+    df = df[~df['text'].str.contains(filter_text_en, na=False)]
+    df = df[~df['text'].str.contains(filter_text_es, na=False)]
+    # Additional preprocessing steps:
+    # Remove URLs and convert text to lowercase
+    df['text'] = df['text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))  # Remove URLs
+    df['text'] = df['text'].apply(lambda x: x.lower())                        # Convert text to lowercase
+    # Remove emojis, images, stickers, documents while preserving colons after sender names
+    df['text'] = df['text'].apply(lambda x: re.sub(r'(?<!\w)(:\s|\s:\s|\s:)', '', x))  # Remove colons that are not part of sender's name
+    df['text'] = df['text'].apply(lambda x: re.sub(r'\[image omitted\]', '', x))  # Remove images
+    df['text'] = df['text'].apply(lambda x: re.sub(r'\[sticker omitted\]', '', x))  # Remove stickers
+    df['text'] = df['text'].apply(lambda x: re.sub(r'\[document omitted\]', '', x)) # Remove documents
+    df['text'] = df['text'].apply(lambda x: re.sub(r'<se editó este mensaje.>', '', x)) # Remove editing function (new Whatsapp addition) in Spanish
+    df['text'] = df['text'].apply(lambda x: re.sub(r'<this message was edited.>', '', x)) # Remove editing function (new Whatsapp addition) in English I AM GUESSING IDk
+    # Group by date and concatenate all messages from the same date
+    df = df.groupby('timestamp')['text'].apply(lambda x: '\n'.join(x)).reset_index()
+    df.columns = ['date', 'text']
+    df['date'] = pd.to_datetime(df['date'])
+    df['text'] = df['text'].astype(str)
+    return df
+def get_dated_input(data, selected_date):
+    '''
+    The Pandas dataframe is processed and the text is extracted.
+    :param data:
+    :param selected_date:
+    :return:
+    '''
+    selected_date = pd.to_datetime(selected_date)
+    data_for_model = data[data['date'].dt.date == selected_date.date()]
+    data_for_model.loc[:, 'text'] = data_for_model['text']
+    first_row_text = data_for_model['text'].iloc[0]
+    return first_row_text

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+torch==2.2.2
+pandas==2.2.2
+transformers==4.39.3
+streamlit==1.33.0
+git+https://github.com/huggingface/peft.git