unt2tled committed on
Commit • 86756d8
1 Parent(s): 6018cfd
init
Files changed:
- .gitattributes +2 -31
- .gitignore +0 -0
- Demo.py +62 -0
- LICENSE +21 -0
- README.md +19 -13
- analysis/linguistic_analysis.py +47 -0
- analysis/words_decision_tree.py +56 -0
- analysis/words_distributions.py +71 -0
- analysis/words_distributions.xlsx +0 -0
- model_loader.py +38 -0
- requirements.txt +5 -0
- tools/__init__.py +0 -0
- tools/__pycache__/__init__.cpython-38.pyc +0 -0
- tools/__pycache__/ocr_video.cpython-38.pyc +0 -0
- tools/__pycache__/video_tools.cpython-38.pyc +0 -0
- tools/facial_features.py +67 -0
- tools/ocr_video.py +65 -0
- tools/text_sentiment.py +64 -0
- tools/text_summarization.py +118 -0
- tools/video_tools.py +24 -0
.gitattributes
CHANGED
@@ -1,31 +1,2 @@
-
-
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Auto detect text files and perform LF normalization
+* text=auto
.gitignore
ADDED
The diff for this file is too large to render.
Demo.py
ADDED
@@ -0,0 +1,62 @@
"""
Demo UI page
"""
import streamlit as st
#import tools.ocr_video as ocr
import os
import shutil
import uuid
from model_loader import HFPretrainedModel
from transformers import pipeline
import torch

@st.cache(hash_funcs={"MyUnhashableClass": lambda _: None})
def load_sentiment_model():
    return pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")

@st.cache(hash_funcs={"MyUnhashableClass": lambda _: None})
def load_campaign_model():
    return HFPretrainedModel("distilbert-base-uncased", "deano/political-campaign-analysis-110922")

if "session_id" not in st.session_state:
    st.session_state["session_id"] = uuid.uuid1()

# Temporary folder path
TMP_PATH = "tmp-{"+str(st.session_state["session_id"])+"}/"

st.title("Demo page")
st.markdown("""Upload the US political campaign video to predict its orientation (base/center).""")
video_file = st.file_uploader("Choose the US political campaign video", type=["wmv", "avi", "mov"], disabled=True)
text = st.text_input("Transcript of the video", "")
b = st.button("Predict")
if b:
    st.markdown("""---""")
    status_bar = st.progress(0)
    upload_cap = st.caption("Uploading video...")
    #if os.path.isdir(TMP_PATH):
    #    shutil.rmtree(TMP_PATH)
    #os.mkdir(TMP_PATH)
    #with open(TMP_PATH+"uploaded_video_tmp", "wb") as f:
    #    f.write(video_file.getbuffer())
    status_bar.progress(50)
    #upload_cap.caption("Extracting text from frames... (can take some time)")
    #text_ocr = ocr.get_formated_text(ocr.retrieve_text(TMP_PATH+"uploaded_video_tmp", frames_path = "tmp_frames-{"+str(st.session_state["session_id"])+"}", show_print = False))
    upload_cap.caption("Extracting text sentiment...")
    sentiment_analysis = load_sentiment_model()
    text_sentiment = sentiment_analysis(text)[0]["label"]
    status_bar.progress(80)

    #shutil.rmtree(TMP_PATH)
    status_bar.progress(90)
    upload_cap.caption("Prediction...")
    model = load_campaign_model()
    #query_dict = {"text": [text], "text_ocr": [text_ocr]}
    query_dict = {"text": [text], "label_sentiment": [text_sentiment]}
    # Predicted confidence for each label
    conf = model.predict(query_dict)
    col1, col2 = st.columns(2)
    col1.metric("Base", "{:.2f}".format(conf[1].item()*100)+"%", "")
    col2.metric("Center", "{:.2f}".format(conf[0].item()*100)+"%", "")

    status_bar.progress(100)
    upload_cap.caption("Done")
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 unt2tled

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,13 +1,19 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Political Campaign Project
+Deep learning pipelines to predict the target of political messages.
+## About
+The goal of this project is to present a machine learning approach for classifying US political campaign videos from different years by target audience (base/center). The classification is done by extracting features from each video (e.g., speech-to-text, visual data) and training a neural network. More details can be found in the related [paper](https://drive.google.com/file/d/1-o9UVRRV7XRlGGBsYUfOkmch2ai-A2Fg/view?usp=sharing).
+## Navigation
+### Dataset
+Datasets, including extracted features, tagging files, and the political campaign videos to train on, can be found [here](https://drive.google.com/drive/folders/1-7rkd_SozNGLrNHXnEZ0iTKqO9ztKhiU?usp=sharing).
+### Features extraction
+All the code used for feature extraction is in the */tools* directory.
+### Analysis
+Code for model analysis is in the */analysis* directory.
+### Training model
+To train the model, use [this](https://colab.research.google.com/drive/1ceVEWRAkIQJsOGuMxmG2qvPY3huZf8gc?usp=sharing) Google Colab notebook. [This](https://colab.research.google.com/drive/1MH19zWCCqQFTKidT5qq6pIPbmsdyuAIp?usp=sharing) notebook is used to make predictions with the pre-trained model.
+### Demo
+An example UI for a pre-trained model with a test accuracy of ~80%, using speech-to-text and on-screen text features, can be found [here](https://unt2tled-political-campaign-project-demo-6gbfbd.streamlitapp.com/) or by cloning the repository and running from the project's root:
+```
+pip install streamlit
+streamlit run Demo.py
+```
analysis/linguistic_analysis.py
ADDED
@@ -0,0 +1,47 @@
"""
This module contains methods for linguistic analysis of texts
"""
import csv
import re
import matplotlib.pyplot as plt

def count_avg_questions(path):
    x = []
    y = ([], [], [])
    with open(path, "r") as tags_file:
        csv_reader = csv.reader(tags_file)
        next(csv_reader)
        counter = [0, 0, 0]
        counter_total = [0, 0, 0]
        for i, row in enumerate(csv_reader):
            x.append(i)
            y[0].append(0)
            y[1].append(0)
            y[2].append(0)
            text = row[1]
            counter_total[int(row[2])] += 1
            counter[int(row[2])] += len(re.findall("\?", text))
            y[int(row[2])][-1] = len(re.findall("\?", text))
    plt.plot(x, y[0])
    #plt.plot(x, y[1])
    plt.plot(x, y[2])
    print(y[2])
    plt.show()
    return [(counter[i]/counter_total[i]) for i in range(len(counter))]

def count_pronouns(path):
    with open(path, "r") as tags_file:
        csv_reader = csv.reader(tags_file)
        next(csv_reader)
        counter = [0, 0, 0]
        counter_total = [0, 0, 0]
        for row in csv_reader:
            text = row[1]
            counter_total[int(row[2])] += 1
            #pattern = "(he)|(she)|(her)|(his)|(them)|(they)|(their)"
            pattern = "(Obama)"
            counter[int(row[2])] += len(re.findall(pattern, text, re.IGNORECASE))
    return [(counter[i]/counter_total[i]) for i in range(len(counter))]

print(count_avg_questions("tags.csv"))
print(count_pronouns("tags.csv"))
analysis/words_decision_tree.py
ADDED
@@ -0,0 +1,56 @@
"""
This module contains methods for word classification using decision trees
"""
from __future__ import print_function
import os
import subprocess
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree
import graphviz

# ref: http://chrisstrelioff.ws/sandbox/2015/06/08/decision_trees_in_python_with_scikit_learn_and_pandas.html

input_file_path = 'text_words_labels.csv'

def get_data(input_file_path):
    df = pd.read_csv(input_file_path)
    return df

def encode_target(df, target_column):
    """Add column to df with integers for the target.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing
                     new Target column.

    Returns
    -------
    df_mod -- modified DataFrame.
    targets -- list of target names.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod["target"] = df_mod[target_column].replace(map_to_int)

    return (df_mod, targets)

df = get_data(input_file_path)
df2, targets = encode_target(df, "target")
print("* df2.head()", df2[["target", "name"]].head(),
      sep="\n", end="\n\n")
print("* df2.tail()", df2[["target", "name"]].tail(),
      sep="\n", end="\n\n")
print("* targets", targets, sep="\n", end="\n\n")

features = [c for c in df2.columns.values if c != 'name' and c != 'isdefinite' and c != 'target']

y = df2["target"]
X = df2[features]
dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
dt.fit(X, y)

plot_tree(dt, max_depth=3)
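Note that `graphviz` is imported but never used in the script as committed; only `plot_tree` renders the tree. A minimal sketch of also exporting the fitted tree through graphviz (using sklearn's `export_graphviz`; the output file name is just an illustrative choice) could look like:
```
from sklearn.tree import export_graphviz
import graphviz

# Export the fitted classifier to DOT, labelling nodes with the word-frequency features
dot_data = export_graphviz(dt, out_file=None, feature_names=features,
                           class_names=[str(t) for t in targets], filled=True)
# Render the tree to a file next to the script (file name is illustrative)
graphviz.Source(dot_data).render("words_decision_tree", cleanup=True)
```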
analysis/words_distributions.py
ADDED
@@ -0,0 +1,71 @@
"""
This module contains classes and methods for words distribution analysis
"""
import pandas as pd
import numpy as np

class WordsDistributionClass:
    ''' This class creates a dataframe with the frequencies
    of the words in the text column of the input file, in addition
    to the file's original columns. '''
    def __init__(self, input_file_path, output_file_path, text_column='text'):
        self.input_file_path = input_file_path
        self.output_file_path = output_file_path
        self.text_column = text_column

    def initialize_data(self):
        # read dataframe from the input CSV file path
        self.df = pd.read_csv(self.input_file_path, encoding='cp1255')
        # add frequencies of the words in the text column as columns
        # of the dataframe which was previously read.
        # Impl. note: all_words is a local dictionary of word frequencies
        # used during the calculation; for each word,
        # all_words[word] == number of videos whose text column contains the word
        all_words = {}
        self.df['freq'] = self.df.apply(lambda x:
            WordsDistributionClass.get_words_freq_in_text(x[self.text_column], all_words), axis=1)
        for word in all_words.keys():
            if all_words[word] >= 10:
                self.df['freq_'+word] = self.df.apply(lambda x:
                    0 if word not in x['freq'].keys() else x['freq'][word], axis=1)
        del all_words
        del self.df['freq']

    def get_words_freq_in_text(text, all_words):
        # static public function
        freq = {}
        # the calculation is not sensitive to upper-case characters
        text = text.lower()
        # the calculation is not sensitive to the characters ";", "," and "."
        # NOTE: it is sensitive to other characters, including question marks,
        # '"', "'" etc.
        text = text.replace(";", "")
        text = text.replace(",", "")
        text = text.replace(".", "")
        words = text.split(" ")
        # algorithm for assigning the words distribution
        # for the given all_words dictionary
        for word in words:
            if word not in all_words:
                all_words[word] = 0
            if word not in freq.keys():
                freq[word] = 1
                all_words[word] += 1
            else:
                freq[word] += 1
        return freq

    def save_output(self):
        # export dataframe to output CSV file path
        self.df.to_csv(self.output_file_path, index=False)

if __name__ == "__main__":
    # Arguments
    INPUT_FILE_NAME = 'tagging_MMD_db_with_face_sentiment_extracted.csv'
    OUTPUT_FILE_NAME = 'tagging_MMD_db_with_face_sentiment_extracted_and_words_distributions.csv'

    # Run WordsDistributionClass on the given input
    wdc = WordsDistributionClass(INPUT_FILE_NAME, OUTPUT_FILE_NAME)
    wdc.initialize_data()
    wdc.save_output()
analysis/words_distributions.xlsx
ADDED
Binary file (826 kB).
model_loader.py
ADDED
@@ -0,0 +1,38 @@
"""
This module contains loaders for loading models to predict a political campaign orientation (base/center)
"""
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, Trainer
from datasets import load_metric
import pandas as pd
import numpy as np
import torch
from torch.nn.functional import softmax

HF_TOKEN = "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"

class HFPretrainedModel:
    def __init__(self, lang_model_name: str, checkpoint: str):
        self.lang_model_name = lang_model_name
        self.checkpoint = checkpoint
        self.init_tokenizer()
        self.init_config()
    @staticmethod
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        metric = load_metric("accuracy")
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)
    def init_tokenizer(self):
        self.tokenizer = AutoTokenizer.from_pretrained(self.lang_model_name)
    def init_config(self):
        self.model = AutoModelForSequenceClassification.from_pretrained(self.checkpoint, use_auth_token=HF_TOKEN, num_labels=2)
        self.trainer = Trainer(model=self.model, tokenizer=self.tokenizer, compute_metrics=HFPretrainedModel.compute_metrics)
    def predict(self, data: dict):
        # Build dataset with one row
        data_to_predict = Dataset.from_dict(data)
        tokenized_ds = data_to_predict.map(lambda examples: self.tokenizer([examples[text_feature] if examples[text_feature] is not None else '' for text_feature in data.keys()], is_split_into_words=True, truncation=True))
        predictions = self.trainer.predict(tokenized_ds)
        pred_tensor = torch.tensor(predictions.predictions[0])
        return softmax(pred_tensor, dim=0)
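For reference, this is the same call pattern Demo.py uses; a minimal standalone sketch (the transcript text and sentiment label below are illustrative values, not project data):
```
from model_loader import HFPretrainedModel

# Load the fine-tuned campaign classifier behind the demo
model = HFPretrainedModel("distilbert-base-uncased", "deano/political-campaign-analysis-110922")

# One-row query: transcript text plus its sentiment label, mirroring Demo.py
query = {"text": ["We will fight for working families."], "label_sentiment": ["POSITIVE"]}
conf = model.predict(query)  # softmax over the two classes
print("Base:   {:.2f}%".format(conf[1].item() * 100))
print("Center: {:.2f}%".format(conf[0].item() * 100))
```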
requirements.txt
ADDED
@@ -0,0 +1,5 @@
transformers
datasets
numpy
pandas
torch
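Assuming a local checkout, these dependencies would typically be installed from the project's root with:
```
pip install -r requirements.txt
```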
tools/__init__.py
ADDED
File without changes
tools/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (171 Bytes).
tools/__pycache__/ocr_video.cpython-38.pyc
ADDED
Binary file (2.32 kB).
tools/__pycache__/video_tools.cpython-38.pyc
ADDED
Binary file (748 Bytes).
tools/facial_features.py
ADDED
@@ -0,0 +1,67 @@
"""
This module allows extracting facial features from videos
"""
import os
import shutil
from retinaface import RetinaFace
from deepface import DeepFace
import json
from video_tools import generate_frames

FRAMES_PATH = "tmp_frames_faces"

def retrieve_faces_data(video_path, rate = 50, show_print = True):
    faces_lst = []
    generate_frames(video_path, FRAMES_PATH, rate = rate, show_print = show_print)
    for i in sorted([int(s[:-4]) for s in os.listdir(FRAMES_PATH)]):
        faces = RetinaFace.extract_faces(FRAMES_PATH + "/" + str(i) + ".png")
        data_lst = []
        for face in faces:
            try:
                face_dict = DeepFace.analyze(face, actions = ["emotion"], detector_backend = "skip")
                data_lst.append(face_dict["emotion"])
            except ValueError:
                # Face was not detected
                continue
        faces_lst.append(data_lst)
    # Delete temporary directory
    #shutil.rmtree(FRAMES_PATH)
    return faces_lst

def retrieve_to_file(dest, video_path):
    face_data = retrieve_faces_data(video_path, show_print = False)
    with open(dest, "w") as output_file:
        output_file.writelines([json.dumps(item) + "\n" for item in face_data])

def retrieve_to_files(dest, video_path):
    for file_name in os.listdir(video_path):
        retrieve_to_file(dest + "/" + os.path.splitext(file_name)[0] + "_data", video_path + "/" + file_name)

def restore_from_file(file_path):
    restored_lst = []
    with open(file_path, "r") as file:
        for line in file.readlines():
            if line != "":
                restored_lst.append(eval(line))
    return restored_lst

def data_to_vector(data):
    vec = []
    for frame in data:
        avg = [0, 0, 0, 0, 0, 0, 0]
        for face in frame:
            avg[0] += face["angry"]
            avg[1] += face["disgust"]
            avg[2] += face["fear"]
            avg[3] += face["happy"]
            avg[4] += face["sad"]
            avg[5] += face["surprise"]
            avg[6] += face["neutral"]
        if len(frame) != 0:
            for i in range(7):
                avg[i] /= len(frame)
        vec.append(avg)
    return vec

if __name__ == "__main__":
    retrieve_to_files("x", "result")
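Since this module imports `video_tools` without the package prefix, it is written to be run from inside */tools*; a minimal sketch of chaining its helpers for one video (the file name is illustrative):
```
# Run from inside the tools/ directory, matching the module's own imports
from facial_features import retrieve_faces_data, data_to_vector

# Per-frame lists of emotion-score dicts (one dict per detected face), sampling every 50th frame
faces_data = retrieve_faces_data("campaign_ad.mp4", rate=50, show_print=False)

# Average the per-face scores into one 7-dimensional vector per sampled frame
# (angry, disgust, fear, happy, sad, surprise, neutral)
emotion_vectors = data_to_vector(faces_data)
print(emotion_vectors[:3])
```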
tools/ocr_video.py
ADDED
@@ -0,0 +1,65 @@
"""
This module allows extracting text from videos using OCR
"""
import easyocr
import os
import cv2
import shutil
import difflib
import re
from tools.video_tools import generate_frames

CONF_THRESH = 0.9
SIMILARITY_THRESH = 0.8

def process_text(text):
    result = re.sub(r"[\n\"\[\]~;]", "", text)
    lst = result.split()
    s = ""
    for item in lst:
        item = item.strip()
        if len(item)!=1 or item == "a" or item == "I" or item == "i" or item == "A":
            s += " "+item
    if len(s)<6:
        s = ""
    return s

def get_formated_text(texts_arr):
    res = ""
    for row in texts_arr:
        k = process_text(row.lower())
        if len(k) > 0:
            res += process_text(row.lower()) + ", "
    return res[:-2]

def add_text(text_lst, text):
    for t in text_lst:
        similarity = difflib.SequenceMatcher(None, t, text).ratio()
        if similarity > SIMILARITY_THRESH:
            return
    text_lst.append(text)

def retrieve_text(video_path, rate = 5, frames_path = "tmp_frames", show_print = True):
    texts_lst = []
    generate_frames(video_path, frames_path, rate = rate, show_print = show_print)
    ocr = easyocr.Reader(['en'])
    for i in os.listdir(frames_path):
        text = ocr.readtext(frames_path + "/" + i)
        for txt in text:
            # Threshold for confidence
            if txt[2] > CONF_THRESH:
                # Filter similar texts
                add_text(texts_lst, txt[1])
    # Delete temporary directory
    shutil.rmtree(frames_path)
    return texts_lst

def retrieve_to_file(dest, video_path):
    text_lst = retrieve_text(video_path, rate = 2, show_print = False)
    file = open(dest, "w")
    file.writelines([line + "\n" for line in text_lst])
    file.close()

def retrieve_to_files(dest, video_path):
    for file_name in os.listdir(video_path):
        retrieve_to_file(dest + "/" + os.path.splitext(file_name)[0] + "_text.txt", video_path + "/" + file_name)
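This is the pipeline behind the commented-out OCR path in Demo.py; a minimal sketch of calling it directly (the video path is illustrative):
```
import tools.ocr_video as ocr

# Keep only high-confidence, de-duplicated on-screen text from every 5th frame
texts = ocr.retrieve_text("campaign_ad.mp4", rate=5, frames_path="tmp_frames", show_print=False)

# Join the surviving snippets into one lower-cased, comma-separated string
text_ocr = ocr.get_formated_text(texts)
print(text_ocr)
```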
tools/text_sentiment.py
ADDED
@@ -0,0 +1,64 @@
"""
This module contains methods for extracting text sentiment from texts
"""
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
# ref: https://colab.research.google.com/github/chrsiebert/sentiment-roberta-large-english/blob/main/sentiment_roberta_prediction_example.ipynb
# Create class for data preparation
class SimpleDataset:
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        return len(self.tokenized_texts["input_ids"])

    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.tokenized_texts.items()}

class Sentiment_Extractor:
    def __init__(self, input_file_name, text_column, output_file_name):
        self.input_file_name = input_file_name
        self.text_column = text_column
        self.output_file_name = output_file_name
    def run(self):
        # Load tokenizer and model, create trainer
        model_name = "siebert/sentiment-roberta-large-english"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        trainer = Trainer(model=model)

        df_pred = pd.read_csv(self.input_file_name, encoding='cp1255')
        pred_texts = df_pred[self.text_column].dropna().astype('str').tolist()

        # Tokenize texts and create prediction data set
        tokenized_texts = tokenizer(pred_texts, truncation=True, padding=True)
        pred_dataset = SimpleDataset(tokenized_texts)

        # Run predictions
        predictions = trainer.predict(pred_dataset)

        # Transform predictions to labels
        preds = predictions.predictions.argmax(-1)
        labels = pd.Series(preds).map(model.config.id2label)
        scores = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1, keepdims=True)).max(1)

        # Create DataFrame with texts, predictions, labels, and scores
        df = pd.DataFrame(list(zip(pred_texts, preds, labels, scores)), columns=['text_sentiment', 'pred_sentiment', 'label_sentiment', 'score_sentiment'])
        df_output = df_pred.merge(df, left_on=self.text_column, right_on='text_sentiment')
        del df_output['text_sentiment']
        df_output.to_csv(self.output_file_name, encoding='cp1255', index=False)

if __name__ == "__main__":
    # Arguments
    # INPUT_FILE_NAME is the name of the input file
    INPUT_FILE_NAME = "tagging_MMD_db_with_summarized.csv"
    # TEXT_COLUMN is the name of the text column in the input file
    # from which we extract the positive / negative sentiment by the 🤗 model.
    TEXT_COLUMN = "text"
    OUTPUT_FILE_NAME = 'tagging_MMD_db_with_sentiment.csv'

    # Run Sentiment_Extractor on the given arguments
    # (TEXT_COLUMN is passed as well, since __init__ expects three arguments)
    obj = Sentiment_Extractor(INPUT_FILE_NAME, TEXT_COLUMN, OUTPUT_FILE_NAME)
    obj.run()
tools/text_summarization.py
ADDED
@@ -0,0 +1,118 @@
"""
This module is for text summarization
"""
# ref: https://towardsdatascience.com/understand-text-summarization-and-create-your-own-summarizer-in-python-b26a9f09fc70
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import pandas as pd
import numpy as np
import networkx as nx

class SummarizationClass:
    def read_text(text):
        text = text.replace("\"", "")
        article = text.split(". ")
        sentences = []

        for sentence in article:
            #print(sentence)
            sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
        #sentences.pop()

        return sentences

    def sentence_similarity(sent1, sent2, stopwords=None):
        if stopwords is None:
            stopwords = []

        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]

        all_words = list(set(sent1 + sent2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        # build the vector for the first sentence
        for w in sent1:
            if w in stopwords:
                continue
            vector1[all_words.index(w)] += 1

        # build the vector for the second sentence
        for w in sent2:
            if w in stopwords:
                continue
            vector2[all_words.index(w)] += 1

        return 1 - cosine_distance(vector1, vector2)

    def build_similarity_matrix(sentences, stop_words):
        # Create an empty similarity matrix
        similarity_matrix = np.zeros((len(sentences), len(sentences)))

        for idx1 in range(len(sentences)):
            for idx2 in range(len(sentences)):
                if idx1 == idx2:  # ignore if both are the same sentence
                    continue
                similarity_matrix[idx1][idx2] = SummarizationClass.sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

        return similarity_matrix


    def generate_summary(text, top_n=5):
        stop_words = stopwords.words('english')
        summarize_text = []

        # Step 1 - Read text and split it
        sentences = SummarizationClass.read_text(text)
        # Step 2 - Generate similarity matrix across sentences
        sentence_similarity_matrix = SummarizationClass.build_similarity_matrix(sentences, stop_words)

        # Step 3 - Rank sentences in the similarity matrix
        sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
        #print(sentence_similarity_graph)
        try:
            scores = nx.pagerank(sentence_similarity_graph)

            # Step 4 - Sort the ranks and pick the top sentences
            ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

            for i in range(top_n):
                summarize_text.append(" ".join(ranked_sentence[i][1]))
        except nx.exception.PowerIterationFailedConvergence:
            print(f'text={text} was bad for nx')
            return ''
        # Step 5 - Output the summarized text
        return ". ".join(summarize_text)

class SummarizationClassRun:
    ''' class for running the summarization algorithm with given parameters '''
    def __init__(self, input_file_path, text_column, output_file_path_keep_original_text_column, output_file_path_override_text_column):
        self.input_file_path = input_file_path
        self.text_column = text_column
        self.output_file_path_keep_original_text_column = output_file_path_keep_original_text_column
        # this fourth argument was referenced but missing from the original signature; added so the __main__ call works
        self.output_file_path_override_text_column = output_file_path_override_text_column
    def run(self):
        # read input file as a dataframe
        df = pd.read_csv(self.input_file_path, encoding='cp1255')
        # add column with summarization of the text in the text column
        df['summarized_text'] = df[self.text_column].apply(lambda x: SummarizationClass.generate_summary(x, 1))
        # export output with the original text column to CSV file
        df.to_csv(self.output_file_path_keep_original_text_column, encoding='cp1255', index=False)
        # override original text column
        df[self.text_column] = df['summarized_text']
        del df['summarized_text']
        # export output with the overridden text column to CSV file
        df.to_csv(self.output_file_path_override_text_column, encoding='cp1255', index=False)

if __name__ == '__main__':
    # Arguments
    INPUT_FILE_PATH = 'tagging_MMD_db.csv'
    TEXT_COLUMN = 'text'
    OUTPUT_FILE_PATH_KEEP_ORIGINAL_TEXT_COLUMN = 'tagging_MMD_db_with_summarized.csv'
    OUTPUT_FILE_PATH_OVERRIDE_TEXT_COLUMN = 'summarized_tagging_MMD_db.csv'
    obj = SummarizationClassRun(INPUT_FILE_PATH, TEXT_COLUMN, OUTPUT_FILE_PATH_KEEP_ORIGINAL_TEXT_COLUMN, OUTPUT_FILE_PATH_OVERRIDE_TEXT_COLUMN)
    obj.run()
tools/video_tools.py
ADDED
@@ -0,0 +1,24 @@
"""
This module contains methods for video processing
"""
import os
import cv2

def generate_frames(video_path, frames_path, rate, show_print = True):
    # Create a new temporary folder
    if not os.path.exists(frames_path):
        os.makedirs(frames_path)
    # Capture video
    src_vid = cv2.VideoCapture(video_path)
    index = 0
    while src_vid.isOpened():
        ret, frame = src_vid.read()
        if not ret:
            break
        name = frames_path + "/" + str(index) + ".png"
        if index % rate == 0:
            if show_print:
                print("Frame: " + name)
            cv2.imwrite(name, frame)
        index = index + 1
    src_vid.release()
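Both ocr_video.py and facial_features.py call this helper the same way; a minimal standalone sketch (paths are illustrative):
```
from tools.video_tools import generate_frames

# Write every 5th frame of the video as numbered PNGs into tmp_frames/ (created if missing)
generate_frames("campaign_ad.mp4", "tmp_frames", rate=5, show_print=False)
```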