unt2tled committed on
Commit • 86756d8
1 Parent(s): 6018cfd
init
Files changed:
- .gitattributes +2 -31
- .gitignore +0 -0
- Demo.py +62 -0
- LICENSE +21 -0
- README.md +19 -13
- analysis/linguistic_analysis.py +47 -0
- analysis/words_decision_tree.py +56 -0
- analysis/words_distributions.py +71 -0
- analysis/words_distributions.xlsx +0 -0
- model_loader.py +38 -0
- requirements.txt +5 -0
- tools/__init__.py +0 -0
- tools/__pycache__/__init__.cpython-38.pyc +0 -0
- tools/__pycache__/ocr_video.cpython-38.pyc +0 -0
- tools/__pycache__/video_tools.cpython-38.pyc +0 -0
- tools/facial_features.py +67 -0
- tools/ocr_video.py +65 -0
- tools/text_sentiment.py +64 -0
- tools/text_summarization.py +118 -0
- tools/video_tools.py +24 -0
.gitattributes
CHANGED
@@ -1,31 +1,2 @@
-
-
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Auto detect text files and perform LF normalization
+* text=auto
.gitignore
ADDED
The diff for this file is too large to render.
Demo.py
ADDED
@@ -0,0 +1,62 @@
"""
Demo UI page
"""
import streamlit as st
#import tools.ocr_video as ocr
import os
import shutil
import uuid
from model_loader import HFPretrainedModel
from transformers import pipeline
import torch

@st.cache(hash_funcs={"MyUnhashableClass": lambda _: None})
def load_sentiment_model():
    return pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")

@st.cache(hash_funcs={"MyUnhashableClass": lambda _: None})
def load_campaign_model():
    return HFPretrainedModel("distilbert-base-uncased", "deano/political-campaign-analysis-110922")

if "session_id" not in st.session_state:
    st.session_state["session_id"] = uuid.uuid1()

# Temporary folder path
TMP_PATH = "tmp-{"+str(st.session_state["session_id"])+"}/"

st.title("Demo page")
st.markdown("""Upload the US political campaign video to predict its orientation (base/center).""")
video_file = st.file_uploader("Choose the US political campaign video", type=["wmv", "avi", "mov"], disabled=True)
text = st.text_input("Transcript of the video", "")
b = st.button("Predict")
if b:
    st.markdown("""---""")
    status_bar = st.progress(0)
    upload_cap = st.caption("Uploading video...")
    #if os.path.isdir(TMP_PATH):
    #    shutil.rmtree(TMP_PATH)
    #os.mkdir(TMP_PATH)
    #with open(TMP_PATH+"uploaded_video_tmp", "wb") as f:
    #    f.write(video_file.getbuffer())
    status_bar.progress(50)
    #upload_cap.caption("Extracting text from frames... (can take some time)")
    #text_ocr = ocr.get_formated_text(ocr.retrieve_text(TMP_PATH+"uploaded_video_tmp", frames_path = "tmp_frames-{"+str(st.session_state["session_id"])+"}", show_print = False))
    upload_cap.caption("Extracting text sentiment...")
    sentiment_analysis = load_sentiment_model()
    text_sentiment = sentiment_analysis(text)[0]["label"]
    status_bar.progress(80)

    #shutil.rmtree(TMP_PATH)
    status_bar.progress(90)
    upload_cap.caption("Prediction...")
    model = load_campaign_model()
    #query_dict = {"text": [text], "text_ocr": [text_ocr]}
    query_dict = {"text": [text], "label_sentiment": [text_sentiment]}
    # Predicted confidence for each label
    conf = model.predict(query_dict)
    col1, col2 = st.columns(2)
    col1.metric("Base", "{:.2f}".format(conf[1].item()*100)+"%", "")
    col2.metric("Center", "{:.2f}".format(conf[0].item()*100)+"%", "")

    status_bar.progress(100)
    upload_cap.caption("Done")
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 unt2tled

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,13 +1,19 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Political Campaign Project
+Deep learning pipelines to predict the target of political messages.
+## About
+The goal of this project is to present a machine learning approach for classifying US political campaign videos from different years by target audience (base/center). The classification is done by extracting features from each video (e.g., speech-to-text, visual data) and training a neural network. More details can be found in the related [paper](https://drive.google.com/file/d/1-o9UVRRV7XRlGGBsYUfOkmch2ai-A2Fg/view?usp=sharing).
+## Navigation
+### Dataset
+Datasets, including extracted features, tagging files, and the political campaign videos to train on, can be found [here](https://drive.google.com/drive/folders/1-7rkd_SozNGLrNHXnEZ0iTKqO9ztKhiU?usp=sharing).
+### Features extraction
+All the code used for feature extraction is in the */tools* directory.
+### Analysis
+Code for model analysis is in the */analysis* directory.
+### Training model
+To train the model, use [this](https://colab.research.google.com/drive/1ceVEWRAkIQJsOGuMxmG2qvPY3huZf8gc?usp=sharing) Google Colab notebook. [This](https://colab.research.google.com/drive/1MH19zWCCqQFTKidT5qq6pIPbmsdyuAIp?usp=sharing) notebook is used to make predictions with the pre-trained model.
+### Demo
+An example UI for a pre-trained model with a test accuracy of ~80%, using speech-to-text and on-screen text features, can be found [here](https://unt2tled-political-campaign-project-demo-6gbfbd.streamlitapp.com/) or by cloning the repository and running from the project's root:
+```
+pip install streamlit
+streamlit run Demo.py
+```
analysis/linguistic_analysis.py
ADDED
@@ -0,0 +1,47 @@
"""
This module contains methods for linguistic analysis of texts
"""
import csv
import re
import matplotlib.pyplot as plt

def count_avg_questions(path):
    x = []
    y = ([], [], [])
    with open(path, "r") as tags_file:
        csv_reader = csv.reader(tags_file)
        next(csv_reader)
        counter = [0, 0, 0]
        counter_total = [0, 0, 0]
        for i, row in enumerate(csv_reader):
            x.append(i)
            y[0].append(0)
            y[1].append(0)
            y[2].append(0)
            text = row[1]
            counter_total[int(row[2])] += 1
            counter[int(row[2])] += len(re.findall("\?", text))
            y[int(row[2])][-1] = len(re.findall("\?", text))
    plt.plot(x, y[0])
    #plt.plot(x, y[1])
    plt.plot(x, y[2])
    print(y[2])
    plt.show()
    return [(counter[i]/counter_total[i]) for i in range(len(counter))]

def count_pronouns(path):
    with open(path, "r") as tags_file:
        csv_reader = csv.reader(tags_file)
        next(csv_reader)
        counter = [0, 0, 0]
        counter_total = [0, 0, 0]
        for row in csv_reader:
            text = row[1]
            counter_total[int(row[2])] += 1
            #pattern = "(he)|(she)|(her)|(his)|(them)|(they)|(their)"
            pattern = "(Obama)"
            counter[int(row[2])] += len(re.findall(pattern, text, re.IGNORECASE))
    return [(counter[i]/counter_total[i]) for i in range(len(counter))]

print(count_avg_questions("tags.csv"))
print(count_pronouns("tags.csv"))
analysis/words_decision_tree.py
ADDED
@@ -0,0 +1,56 @@
"""
This module contains methods for word classification using decision trees
"""
from __future__ import print_function
import os
import subprocess
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree
import graphviz

# ref: http://chrisstrelioff.ws/sandbox/2015/06/08/decision_trees_in_python_with_scikit_learn_and_pandas.html

input_file_path = 'text_words_labels.csv'

def get_data(input_file_path):
    df = pd.read_csv(input_file_path)
    return df

def encode_target(df, target_column):
    """Add column to df with integers for the target.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing
                     new Target column.

    Returns
    -------
    df_mod -- modified DataFrame.
    targets -- list of target names.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod["target"] = df_mod[target_column].replace(map_to_int)

    return (df_mod, targets)

df = get_data(input_file_path)
df2, targets = encode_target(df, "target")
print("* df2.head()", df2[["target", "name"]].head(),
      sep="\n", end="\n\n")
print("* df2.tail()", df2[["target", "name"]].tail(),
      sep="\n", end="\n\n")
print("* targets", targets, sep="\n", end="\n\n")

features = [c for c in df2.columns.values if c != 'name' and c != 'isdefinite' and c != 'target']

y = df2["target"]
X = df2[features]
dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
dt.fit(X, y)

plot_tree(dt, max_depth=3)
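Note that `graphviz` is imported but never used in the script as committed; only `plot_tree` renders the tree. A minimal sketch of also exporting the fitted tree through graphviz (using sklearn's `export_graphviz`; the output file name is just an illustrative choice) could look like:
```
from sklearn.tree import export_graphviz
import graphviz

# Export the fitted classifier to DOT, labelling nodes with the word-frequency features
dot_data = export_graphviz(dt, out_file=None, feature_names=features,
                           class_names=[str(t) for t in targets], filled=True)
# Render the tree to a file next to the script (file name is illustrative)
graphviz.Source(dot_data).render("words_decision_tree", cleanup=True)
```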
analysis/words_distributions.py
ADDED
@@ -0,0 +1,71 @@
"""
This module contains classes and methods for words distribution analysis
"""
import pandas as pd
import numpy as np

class WordsDistributionClass:
    ''' This class creates a dataframe with the frequencies
    of the words in the text column of the input file, in addition
    to the file's original columns. '''
    def __init__(self, input_file_path, output_file_path, text_column='text'):
        self.input_file_path = input_file_path
        self.output_file_path = output_file_path
        self.text_column = text_column

    def initialize_data(self):
        # read dataframe from the input CSV file path
        self.df = pd.read_csv(self.input_file_path, encoding='cp1255')
        # add frequencies of the words in the text column as columns
        # of the dataframe which was previously read.
        # Impl. note: all_words is a local dictionary of word frequencies
        # used during the calculation; for each word,
        # all_words[word] == number of videos whose text column contains the word
        all_words = {}
        self.df['freq'] = self.df.apply(lambda x:
            WordsDistributionClass.get_words_freq_in_text(x[self.text_column], all_words), axis=1)
        for word in all_words.keys():
            if all_words[word] >= 10:
                self.df['freq_'+word] = self.df.apply(lambda x:
                    0 if word not in x['freq'].keys() else x['freq'][word], axis=1)
        del all_words
        del self.df['freq']

    def get_words_freq_in_text(text, all_words):
        # static public function
        freq = {}
        # the calculation is not sensitive to upper-case characters
        text = text.lower()
        # the calculation is not sensitive to the characters ";", "," and "."
        # NOTE: it is sensitive to other characters, including question marks,
        # '"', "'" etc.
        text = text.replace(";", "")
        text = text.replace(",", "")
        text = text.replace(".", "")
        words = text.split(" ")
        # algorithm for assigning the words distribution
        # for the given all_words dictionary
        for word in words:
            if word not in all_words:
                all_words[word] = 0
            if word not in freq.keys():
                freq[word] = 1
                all_words[word] += 1
            else:
                freq[word] += 1
        return freq

    def save_output(self):
        # export dataframe to output CSV file path
        self.df.to_csv(self.output_file_path, index=False)

if __name__ == "__main__":
    # Arguments
    INPUT_FILE_NAME = 'tagging_MMD_db_with_face_sentiment_extracted.csv'
    OUTPUT_FILE_NAME = 'tagging_MMD_db_with_face_sentiment_extracted_and_words_distributions.csv'

    # Run WordsDistributionClass on the given input
    wdc = WordsDistributionClass(INPUT_FILE_NAME, OUTPUT_FILE_NAME)
    wdc.initialize_data()
    wdc.save_output()
analysis/words_distributions.xlsx
ADDED
Binary file (826 kB).
model_loader.py
ADDED
@@ -0,0 +1,38 @@
"""
This module contains loaders for loading models to predict a political campaign orientation (base/center)
"""
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, Trainer
from datasets import load_metric
import pandas as pd
import numpy as np
import torch
from torch.nn.functional import softmax

HF_TOKEN = "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"

class HFPretrainedModel:
    def __init__(self, lang_model_name: str, checkpoint: str):
        self.lang_model_name = lang_model_name
        self.checkpoint = checkpoint
        self.init_tokenizer()
        self.init_config()
    @staticmethod
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        metric = load_metric("accuracy")
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)
    def init_tokenizer(self):
        self.tokenizer = AutoTokenizer.from_pretrained(self.lang_model_name)
    def init_config(self):
        self.model = AutoModelForSequenceClassification.from_pretrained(self.checkpoint, use_auth_token=HF_TOKEN, num_labels=2)
        self.trainer = Trainer(model=self.model, tokenizer=self.tokenizer, compute_metrics=HFPretrainedModel.compute_metrics)
    def predict(self, data: dict):
        # Build dataset with one row
        data_to_predict = Dataset.from_dict(data)
        tokenized_ds = data_to_predict.map(lambda examples: self.tokenizer([examples[text_feature] if examples[text_feature] is not None else '' for text_feature in data.keys()], is_split_into_words=True, truncation=True))
        predictions = self.trainer.predict(tokenized_ds)
        pred_tensor = torch.tensor(predictions.predictions[0])
        return softmax(pred_tensor, dim=0)
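For reference, this is the same call pattern Demo.py uses; a minimal standalone sketch (the transcript text and sentiment label below are illustrative values, not project data):
```
from model_loader import HFPretrainedModel

# Load the fine-tuned campaign classifier behind the demo
model = HFPretrainedModel("distilbert-base-uncased", "deano/political-campaign-analysis-110922")

# One-row query: transcript text plus its sentiment label, mirroring Demo.py
query = {"text": ["We will fight for working families."], "label_sentiment": ["POSITIVE"]}
conf = model.predict(query)  # softmax over the two classes
print("Base:   {:.2f}%".format(conf[1].item() * 100))
print("Center: {:.2f}%".format(conf[0].item() * 100))
```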
requirements.txt
ADDED
@@ -0,0 +1,5 @@
transformers
datasets
numpy
pandas
torch
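Assuming a local checkout, these dependencies would typically be installed from the project's root with:
```
pip install -r requirements.txt
```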
tools/__init__.py
ADDED
File without changes
tools/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (171 Bytes).
tools/__pycache__/ocr_video.cpython-38.pyc
ADDED
Binary file (2.32 kB).
tools/__pycache__/video_tools.cpython-38.pyc
ADDED
Binary file (748 Bytes).
tools/facial_features.py
ADDED
@@ -0,0 +1,67 @@
"""
This module allows extracting facial features from videos
"""
import os
import shutil
from retinaface import RetinaFace
from deepface import DeepFace
import json
from video_tools import generate_frames

FRAMES_PATH = "tmp_frames_faces"

def retrieve_faces_data(video_path, rate = 50, show_print = True):
    faces_lst = []
    generate_frames(video_path, FRAMES_PATH, rate = rate, show_print = show_print)
    for i in sorted([int(s[:-4]) for s in os.listdir(FRAMES_PATH)]):
        faces = RetinaFace.extract_faces(FRAMES_PATH + "/" + str(i) + ".png")
        data_lst = []
        for face in faces:
            try:
                face_dict = DeepFace.analyze(face, actions = ["emotion"], detector_backend = "skip")
                data_lst.append(face_dict["emotion"])
            except ValueError:
                # Face was not detected
                continue
        faces_lst.append(data_lst)
    # Delete temporary directory
    #shutil.rmtree(FRAMES_PATH)
    return faces_lst

def retrieve_to_file(dest, video_path):
    face_data = retrieve_faces_data(video_path, show_print = False)
    with open(dest, "w") as output_file:
        output_file.writelines([json.dumps(item) + "\n" for item in face_data])

def retrieve_to_files(dest, video_path):
    for file_name in os.listdir(video_path):
        retrieve_to_file(dest + "/" + os.path.splitext(file_name)[0] + "_data", video_path + "/" + file_name)

def restore_from_file(file_path):
    restored_lst = []
    with open(file_path, "r") as file:
        for line in file.readlines():
            if line != "":
                restored_lst.append(eval(line))
    return restored_lst

def data_to_vector(data):
    vec = []
    for frame in data:
        avg = [0, 0, 0, 0, 0, 0, 0]
        for face in frame:
            avg[0] += face["angry"]
            avg[1] += face["disgust"]
            avg[2] += face["fear"]
            avg[3] += face["happy"]
            avg[4] += face["sad"]
            avg[5] += face["surprise"]
            avg[6] += face["neutral"]
        if len(frame) != 0:
            for i in range(7):
                avg[i] /= len(frame)
        vec.append(avg)
    return vec

if __name__ == "__main__":
    retrieve_to_files("x", "result")
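Since this module imports `video_tools` without the package prefix, it is written to be run from inside */tools*; a minimal sketch of chaining its helpers for one video (the file name is illustrative):
```
# Run from inside the tools/ directory, matching the module's own imports
from facial_features import retrieve_faces_data, data_to_vector

# Per-frame lists of emotion-score dicts (one dict per detected face), sampling every 50th frame
faces_data = retrieve_faces_data("campaign_ad.mp4", rate=50, show_print=False)

# Average the per-face scores into one 7-dimensional vector per sampled frame
# (angry, disgust, fear, happy, sad, surprise, neutral)
emotion_vectors = data_to_vector(faces_data)
print(emotion_vectors[:3])
```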
tools/ocr_video.py
ADDED
@@ -0,0 +1,65 @@
"""
This module allows extracting text from videos using OCR
"""
import easyocr
import os
import cv2
import shutil
import difflib
import re
from tools.video_tools import generate_frames

CONF_THRESH = 0.9
SIMILARITY_THRESH = 0.8

def process_text(text):
    result = re.sub(r"[\n\"\[\]~;]", "", text)
    lst = result.split()
    s = ""
    for item in lst:
        item = item.strip()
        if len(item)!=1 or item == "a" or item == "I" or item == "i" or item == "A":
            s += " "+item
    if len(s)<6:
        s = ""
    return s

def get_formated_text(texts_arr):
    res = ""
    for row in texts_arr:
        k = process_text(row.lower())
        if len(k) > 0:
            res += process_text(row.lower()) + ", "
    return res[:-2]

def add_text(text_lst, text):
    for t in text_lst:
        similarity = difflib.SequenceMatcher(None, t, text).ratio()
        if similarity > SIMILARITY_THRESH:
            return
    text_lst.append(text)

def retrieve_text(video_path, rate = 5, frames_path = "tmp_frames", show_print = True):
    texts_lst = []
    generate_frames(video_path, frames_path, rate = rate, show_print = show_print)
    ocr = easyocr.Reader(['en'])
    for i in os.listdir(frames_path):
        text = ocr.readtext(frames_path + "/" + i)
        for txt in text:
            # Threshold for confidence
            if txt[2] > CONF_THRESH:
                # Filter similar texts
                add_text(texts_lst, txt[1])
    # Delete temporary directory
    shutil.rmtree(frames_path)
    return texts_lst

def retrieve_to_file(dest, video_path):
    text_lst = retrieve_text(video_path, rate = 2, show_print = False)
    file = open(dest, "w")
    file.writelines([line + "\n" for line in text_lst])
    file.close()

def retrieve_to_files(dest, video_path):
    for file_name in os.listdir(video_path):
        retrieve_to_file(dest + "/" + os.path.splitext(file_name)[0] + "_text.txt", video_path + "/" + file_name)
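This is the pipeline behind the commented-out OCR path in Demo.py; a minimal sketch of calling it directly (the video path is illustrative):
```
import tools.ocr_video as ocr

# Keep only high-confidence, de-duplicated on-screen text from every 5th frame
texts = ocr.retrieve_text("campaign_ad.mp4", rate=5, frames_path="tmp_frames", show_print=False)

# Join the surviving snippets into one lower-cased, comma-separated string
text_ocr = ocr.get_formated_text(texts)
print(text_ocr)
```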
tools/text_sentiment.py
ADDED
@@ -0,0 +1,64 @@
"""
This module contains methods for extracting text sentiment from texts
"""
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
# ref: https://colab.research.google.com/github/chrsiebert/sentiment-roberta-large-english/blob/main/sentiment_roberta_prediction_example.ipynb
# Create class for data preparation
class SimpleDataset:
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        return len(self.tokenized_texts["input_ids"])

    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.tokenized_texts.items()}

class Sentiment_Extractor:
    def __init__(self, input_file_name, text_column, output_file_name):
        self.input_file_name = input_file_name
        self.text_column = text_column
        self.output_file_name = output_file_name
    def run(self):
        # Load tokenizer and model, create trainer
        model_name = "siebert/sentiment-roberta-large-english"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        trainer = Trainer(model=model)

        df_pred = pd.read_csv(self.input_file_name, encoding='cp1255')
        pred_texts = df_pred[self.text_column].dropna().astype('str').tolist()

        # Tokenize texts and create prediction data set
        tokenized_texts = tokenizer(pred_texts, truncation=True, padding=True)
        pred_dataset = SimpleDataset(tokenized_texts)

        # Run predictions
        predictions = trainer.predict(pred_dataset)

        # Transform predictions to labels
        preds = predictions.predictions.argmax(-1)
        labels = pd.Series(preds).map(model.config.id2label)
        scores = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1, keepdims=True)).max(1)

        # Create DataFrame with texts, predictions, labels, and scores
        df = pd.DataFrame(list(zip(pred_texts, preds, labels, scores)), columns=['text_sentiment', 'pred_sentiment', 'label_sentiment', 'score_sentiment'])
        df_output = df_pred.merge(df, left_on=self.text_column, right_on='text_sentiment')
        del df_output['text_sentiment']
        df_output.to_csv(self.output_file_name, encoding='cp1255', index=False)

if __name__ == "__main__":
    # Arguments
    # INPUT_FILE_NAME is the name of the input file
    INPUT_FILE_NAME = "tagging_MMD_db_with_summarized.csv"
    # TEXT_COLUMN is the name of the text column in the input file
    # from which we extract the positive / negative sentiment by the 🤗 model.
    TEXT_COLUMN = "text"
    OUTPUT_FILE_NAME = 'tagging_MMD_db_with_sentiment.csv'

    # Run Sentiment_Extractor on the given arguments
    # (TEXT_COLUMN is passed as well, since __init__ expects three arguments)
    obj = Sentiment_Extractor(INPUT_FILE_NAME, TEXT_COLUMN, OUTPUT_FILE_NAME)
    obj.run()
tools/text_summarization.py
ADDED
@@ -0,0 +1,118 @@
"""
This module is for text summarization
"""
# ref: https://towardsdatascience.com/understand-text-summarization-and-create-your-own-summarizer-in-python-b26a9f09fc70
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import pandas as pd
import numpy as np
import networkx as nx

class SummarizationClass:
    def read_text(text):
        text = text.replace("\"", "")
        article = text.split(". ")
        sentences = []

        for sentence in article:
            #print(sentence)
            sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
        #sentences.pop()

        return sentences

    def sentence_similarity(sent1, sent2, stopwords=None):
        if stopwords is None:
            stopwords = []

        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]

        all_words = list(set(sent1 + sent2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        # build the vector for the first sentence
        for w in sent1:
            if w in stopwords:
                continue
            vector1[all_words.index(w)] += 1

        # build the vector for the second sentence
        for w in sent2:
            if w in stopwords:
                continue
            vector2[all_words.index(w)] += 1

        return 1 - cosine_distance(vector1, vector2)

    def build_similarity_matrix(sentences, stop_words):
        # Create an empty similarity matrix
        similarity_matrix = np.zeros((len(sentences), len(sentences)))

        for idx1 in range(len(sentences)):
            for idx2 in range(len(sentences)):
                if idx1 == idx2:  # ignore if both are the same sentence
                    continue
                similarity_matrix[idx1][idx2] = SummarizationClass.sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

        return similarity_matrix


    def generate_summary(text, top_n=5):
        stop_words = stopwords.words('english')
        summarize_text = []

        # Step 1 - Read text and split it
        sentences = SummarizationClass.read_text(text)
        # Step 2 - Generate similarity matrix across sentences
        sentence_similarity_matrix = SummarizationClass.build_similarity_matrix(sentences, stop_words)

        # Step 3 - Rank sentences in the similarity matrix
        sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
        #print(sentence_similarity_graph)
        try:
            scores = nx.pagerank(sentence_similarity_graph)

            # Step 4 - Sort the ranks and pick the top sentences
            ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

            for i in range(top_n):
                summarize_text.append(" ".join(ranked_sentence[i][1]))
        except nx.exception.PowerIterationFailedConvergence:
            print(f'text={text} was bad for nx')
            return ''
        # Step 5 - Output the summarized text
        return ". ".join(summarize_text)

class SummarizationClassRun:
    ''' class for running the summarization algorithm with given parameters '''
    def __init__(self, input_file_path, text_column, output_file_path_keep_original_text_column, output_file_path_override_text_column):
        self.input_file_path = input_file_path
        self.text_column = text_column
        self.output_file_path_keep_original_text_column = output_file_path_keep_original_text_column
        # this fourth argument was referenced but missing from the original signature; added so the __main__ call works
        self.output_file_path_override_text_column = output_file_path_override_text_column
    def run(self):
        # read input file as a dataframe
        df = pd.read_csv(self.input_file_path, encoding='cp1255')
        # add column with summarization of the text in the text column
        df['summarized_text'] = df[self.text_column].apply(lambda x: SummarizationClass.generate_summary(x, 1))
        # export output with the original text column to CSV file
        df.to_csv(self.output_file_path_keep_original_text_column, encoding='cp1255', index=False)
        # override original text column
        df[self.text_column] = df['summarized_text']
        del df['summarized_text']
        # export output with the overridden text column to CSV file
        df.to_csv(self.output_file_path_override_text_column, encoding='cp1255', index=False)

if __name__ == '__main__':
    # Arguments
    INPUT_FILE_PATH = 'tagging_MMD_db.csv'
    TEXT_COLUMN = 'text'
    OUTPUT_FILE_PATH_KEEP_ORIGINAL_TEXT_COLUMN = 'tagging_MMD_db_with_summarized.csv'
    OUTPUT_FILE_PATH_OVERRIDE_TEXT_COLUMN = 'summarized_tagging_MMD_db.csv'
    obj = SummarizationClassRun(INPUT_FILE_PATH, TEXT_COLUMN, OUTPUT_FILE_PATH_KEEP_ORIGINAL_TEXT_COLUMN, OUTPUT_FILE_PATH_OVERRIDE_TEXT_COLUMN)
    obj.run()
tools/video_tools.py
ADDED
@@ -0,0 +1,24 @@
"""
This module contains methods for video processing
"""
import os
import cv2

def generate_frames(video_path, frames_path, rate, show_print = True):
    # Create a new temporary folder
    if not os.path.exists(frames_path):
        os.makedirs(frames_path)
    # Capture video
    src_vid = cv2.VideoCapture(video_path)
    index = 0
    while src_vid.isOpened():
        ret, frame = src_vid.read()
        if not ret:
            break
        name = frames_path + "/" + str(index) + ".png"
        if index % rate == 0:
            if show_print:
                print("Frame: " + name)
            cv2.imwrite(name, frame)
        index = index + 1
    src_vid.release()
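Both ocr_video.py and facial_features.py call this helper the same way; a minimal standalone sketch (paths are illustrative):
```
from tools.video_tools import generate_frames

# Write every 5th frame of the video as numbered PNGs into tmp_frames/ (created if missing)
generate_frames("campaign_ad.mp4", "tmp_frames", rate=5, show_print=False)
```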