{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import gravis as gv\n", "import pandas as pd\n", "import networkx as nx\n", "import matplotlib.pyplot as plt\n", "from netgraph import Graph\n", "import numpy as np\n", "from sentence_transformers import SentenceTransformer\n", "import networkx as nx\n", "from sklearn.cluster import KMeans\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "model = SentenceTransformer('all-MiniLM-L6-v2')\n", "from bertopic import BERTopic\n", "from bertopic.representation import OpenAI\n", "import openai\n", "from sentence_transformers import SentenceTransformer\n", "from umap import UMAP\n", "from hdbscan import HDBSCAN\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from nltk.corpus import stopwords\n", "from community import community_louvain\n", "import pandas as pd\n", "import networkx as nx\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "from sentence_transformers import SentenceTransformer\n", "import networkx as nx\n", "from sklearn.cluster import KMeans\n", "from pyvis.network import Network\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "\n", "openai.api_key = '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'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from langchain.agents import create_pandas_dataframe_agent\n", "from langchain.llms import OpenAI\n", "import pandas as pd\n", "import os\n", "os.environ['OPENAI_API_KEY'] = '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'\n", "os.environ['SERPAPI_API_KEY'] = 'bf4a8deceac484f1cc5f425581b8a4ae35c7cbb4fd61bbb74880e411300b913e'\n", "os.environ['WOLFRAM_ALPHA_APPID'] = 'LJER8Q-3QWH24T2PX'\n", "\n", "\n", "\n", "def get_memory():\n", " memory_string = ''\n", " for i,j in memory.items():\n", " print(i, j)\n", " memory_string += str(j) + '\\n'\n", " return memory_string\n", "\n", "\n", "\n", "\n", "def check_words_in_string(word_list, input_string, case=False):\n", "\n", " input_string = input_string.lower()\n", "\n", " # Convert words to lowercase if case is False\n", " word_list = [word.lower() if case else word for word in word_list]\n", "\n", " # Check if any word is in the input_string\n", " result = any(word in input_string for word in word_list)\n", "\n", " # check if True\n", " if result:\n", " return True\n", " else:\n", " return False\n", "\n", "\n", "# Will be used by the Langchain chatbot\n", "\n", "words = ['rows', 'data', 'length', 'dataset','plot', 'col','columns','column', 'max', 'min', 'minimum', 'maximum', 'visualize','visualise','represent','graph','chart','plot','diagram','illustrate','show','depict','display','count','number','sum','total','aggregate','trend','pattern','distribution','average','linechart','scatter','barchart','piechart','histogram','boxplot','heatmap','correlation','regression','forecast','predict']\n", "\n", "memory = {'agent':[], 'user':[]}\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package vader_lexicon to\n", "[nltk_data] /Users/dataexpert/nltk_data...\n", "[nltk_data] Package vader_lexicon is already up-to-date!\n" ] } ], "source": [ "import gradio as gr\n", "import requests\n", "import pyaudio\n", "# import io\n", "# from io import BytesIOscr\n", "import librosa\n", "import numpy as np\n", "import json\n", "\n", "import numpy as np\n", "import tiktoken\n", 
"import pandas as pd\n", "\n", "from wordcloud import WordCloud\n", "import matplotlib.pyplot as plt\n", "\n", "import umap\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "import hdbscan\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "from PIL import Image\n", "\n", "import seaborn as sns\n", "from textblob import TextBlob\n", "import matplotlib.font_manager as font_manager\n", "font = font_manager.FontProperties(family='Noto Serif')\n", "\n", "import os\n", "from langchain.agents import load_tools\n", "from langchain.agents import initialize_agent\n", "from langchain.llms import OpenAI\n", "import pinecone\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "from langchain.utilities import WikipediaAPIWrapper\n", "import gradio as gr\n", "import openai\n", "import pandas as pd\n", "import numpy as np\n", "import re\n", "import whisper\n", "import time\n", "\n", "import openai\n", "\n", "import pandas as pd\n", "\n", "#from spacy import displacy\n", "import networkx as nx\n", "import matplotlib.pyplot as plt\n", "import scipy\n", "\n", "from pyvis.network import Network\n", "import networkx as nx\n", "# import cmap\n", "import matplotlib.pyplot as plt\n", "from matplotlib import cm\n", "\n", "from langchain import OpenAI, PromptTemplate, LLMChain\n", "\n", "from langchain.agents import load_tools\n", "from langchain.agents import initialize_agent\n", "from langchain.llms import OpenAI\n", "from langchain.document_loaders import YoutubeLoader\n", "from langchain.chains.summarize import load_summarize_chain\n", "\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter\n", "from langchain.vectorstores import Pinecone\n", "from langchain.chains.question_answering import load_qa_chain\n", "from langchain.chains.qa_with_sources import load_qa_with_sources_chain\n", "from langchain.chains.question_answering import load_qa_chain\n", "from langchain.chains import SimpleSequentialChain\n", "from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification\n", "from langchain.document_loaders import YoutubeLoader\n", "import time\n", "import re\n", "import pinecone\n", "import pandas as pd\n", "from sentence_transformers import SentenceTransformer, util\n", "\n", "#import numpy as np\n", "from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings\n", "from sklearn.metrics import silhouette_score\n", "import torch\n", "import nltk\n", "nltk.download('vader_lexicon')\n", "\n", "\n", "def get_key(item):\n", " return item['label']\n", "\n", "def get_emotion_bertweet(dataset):\n", " tokenizer4 = AutoTokenizer.from_pretrained(\"finiteautomata/bertweet-base-emotion-analysis\", truncation=True)\n", " model4 = AutoModelForSequenceClassification.from_pretrained(\"finiteautomata/bertweet-base-emotion-analysis\")\n", " nlp = pipeline('sentiment-analysis', model=model4,\n", " tokenizer=tokenizer4, top_k=6, truncation=True, device=0)\n", "\n", " top_emotion = []\n", " # apply emotion model on data and get the labels and scores\n", " for i in range(len(dataset)):\n", " label = []\n", " score = []\n", " jsonfile = (nlp(dataset['translated_text'].iloc[i]))\n", " jsonfile[0].sort(key=get_key)\n", " for j in range(0, 6):\n", " jsonfile2 = np.array(jsonfile)\n", " label.append(jsonfile2[0][j]['label'])\n", " score.append(jsonfile2[0][j]['score'])\n", "\n", " top_emotion.append(label[score.index(max(score))])\n", " dataset['top_emotion_bertweet'] = top_emotion\n", " return 
dataset\n", "\n", "\n", "model_name = \"sentence-transformers/all-MiniLM-L6-v2\"\n", "hf = HuggingFaceEmbeddings(model_name=model_name)\n", "os.environ['OPENAI_API_KEY'] = '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'\n", "os.environ['SERPAPI_API_KEY'] = 'bf4a8deceac484f1cc5f425581b8a4ae35c7cbb4fd61bbb74880e411300b913e'\n", "os.environ['WOLFRAM_ALPHA_APPID'] = 'LJER8Q-3QWH24T2PX'\n", "\n", "embeddings = OpenAIEmbeddings()\n", "\n", "# pinecone.init(\n", "# api_key='ENTER API KEY HERE',\n", "# environment='us-central1-gcp'\n", "# )\n", "# index_name = 'openaigradio'\n", "\n", "tokenizer4 = AutoTokenizer.from_pretrained(\"finiteautomata/bertweet-base-emotion-analysis\", truncation=True)\n", "model4 = AutoModelForSequenceClassification.from_pretrained(\"finiteautomata/bertweet-base-emotion-analysis\")\n", "\n", "openai.api_key = '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'\n", "model_whisp = whisper.load_model(\"base\")\n", "\n", "llm = OpenAI(temperature=0.2, model_name='text-davinci-003', max_tokens=1000, top_p=1)\n", "\n", "model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n", "# check if cpu or gpu\n", "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n", "model = model.to(device)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Industries = ['Agriculture',\n", " 'Automobile Manufacturing',\n", " 'Banking and Finance',\n", " 'Biotechnology',\n", " 'Chemicals and Petrochemicals',\n", " 'Construction and Engineering',\n", " 'Consumer Goods and Retail',\n", " 'Education',\n", " 'Electronics',\n", " 'Energy (Oil, Gas, Coal, and Renewable Energy)',\n", " 'Entertainment and Media',\n", " 'Food and Beverage',\n", " 'Healthcare and Pharmaceuticals',\n", " 'Hospitality, Travel, and Tourism',\n", " 'Information Technology (IT) and Software',\n", " 'Insurance',\n", " 'Manufacturing (various sectors)',\n", " 'Mining and Metals',\n", " 'Real Estate',\n", " 'Renewable Energy (Solar, Wind, Hydro, Geothermal)',\n", " 'Telecommunications',\n", " 'Textiles and Apparel',\n", " 'Transportation and Logistics',\n", " 'Utilities (Electricity, Water, Gas)',\n", " 'Waste Management and Environmental Services']\n", "\n", "\n", " \n", "def load_csv(csv_file=None):\n", " if csv_file.name.endswith('.csv'):\n", " df = pd.read_csv(\n", " csv_file.name, delimiter=',', encoding='utf-8-sig')\n", " df.fillna('', inplace=True)\n", " elif csv_file.name.endswith('.xlsx'):\n", " df = pd.read_excel(csv_file.name)\n", " df.fillna('', inplace=True)\n", " elif csv_file.name.endswith('.h5'): \n", " try:\n", " df = pd.read_hdf(csv_file.name, key='df')\n", " df.fillna('', inplace=True)\n", " except:\n", " df = pd.read_hdf(csv_file.name)\n", " df.fillna('', inplace=True)\n", "\n", " if 'translated_text' not in df.columns:\n", " try:\n", " df['translated_text'] = df['clean_text_emotions']\n", " except:\n", " try:\n", " df['translated_text'] = df['split_summary_y']\n", " except:\n", " try:\n", " df['translated_text'] = df['split_summary']\n", " except:\n", " try:\n", " df['translated_text'] = df['clean_text']\n", " except:\n", " try:\n", " df['translated_text'] = df['text']\n", " except:\n", " try:\n", " df['translated_text'] = df['content']\n", " except:\n", " try:\n", " df['translated_text'] = df['Comment']\n", " except:\n", " try:\n", " df['translated_text'] = df['summary']\n", " except:\n", " df['translated_text'] = df['body']\n", "\n", "\n", " if 'embeddings' not in df.columns:\n", "\n", " if 'encoding' in df.columns:\n", " if 
type(df['encoding'][0]) == str:\n", " embeddings_array = np.vstack(df['encoding'].apply(\n", " lambda x: np.fromstring(x[1:-1], sep=' ', dtype=np.float32)))\n", " df['embeddings'] = list(embeddings_array)\n", " else:\n", " df['embeddings'] = df['encoding']\n", " \n", " elif 'paragraph_embeddings' in df.columns:\n", " if type(df['paragraph_embeddings'][0]) == str:\n", " embeddings_array = np.vstack(df['paragraph_embeddings'].apply(\n", " lambda x: np.fromstring(x[1:-1], sep=' ', dtype=np.float32)))\n", " df['embeddings'] = list(embeddings_array)\n", " else:\n", " df['embeddings'] = df['paragraph_embeddings']\n", " \n", " else:\n", " df['embeddings'] = df['translated_text'].apply(\n", " lambda x: model.encode(x))\n", " else:\n", " if type(df['embeddings'][0]) == str:\n", " embeddings_array = np.vstack(df['embeddings'].apply(\n", " lambda x: np.fromstring(x[1:-1], sep=' ', dtype=np.float32)))\n", " df['embeddings'] = list(embeddings_array)\n", " else:\n", " pass\n", "\n", " finished_loading = 'Data has been loaded successfully'\n", " \n", "\n", "\n", " return df, finished_loading, df[['translated_text']][0:30]\n", "\n", "\n", "def topic_modelling(mydataset):\n", " \n", " mydataset['translated_text'] = mydataset['translated_text'].apply(lambda x: str(x))\n", "\n", " prompt = \"\"\"\n", " I have a topic that contains the following documents: \n", " [DOCUMENTS]\n", " The topic is described by the following keywords: [KEYWORDS]\n", "\n", " Based on the information above, extract a short topic label in the following format:\n", " topic: \n", " \"\"\"\n", "\n", " from sentence_transformers import SentenceTransformer\n", " from umap import UMAP\n", " from hdbscan import HDBSCAN\n", "\n", " umap_model = UMAP(n_neighbors=15, n_components=5, metric='cosine')\n", "\n", " hdbscan_model = HDBSCAN(min_cluster_size=10,\n", " gen_min_span_tree=True,\n", " prediction_data=True)\n", "\n", " from sklearn.feature_extraction.text import CountVectorizer\n", " from nltk.corpus import stopwords\n", "\n", " stopwords = list(stopwords.words('english')) + ['http', 'https', 'amp', 'com']\n", "\n", " #we add this to remove stopwords that can pollute topcs\n", " vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stopwords)\n", "\n", "\n", " #convert to numpy array\n", "\n", "\n", " #representation_model = OpenAI(model=\"gpt-3.5-turbo\", delay_in_seconds=.5, chat=True)\n", "\n", "\n", " topic_model = BERTopic(\n", " umap_model=umap_model,\n", " hdbscan_model=hdbscan_model,\n", " embedding_model=model,\n", " vectorizer_model=vectorizer_model,\n", " #representation_model=representation_model,\n", " top_n_words=5,\n", " language='english',\n", " calculate_probabilities=True,\n", " verbose=True\n", " )\n", "\n", " docs = mydataset['translated_text'].unique()\n", " topics, props = topic_model.fit_transform(docs)\n", "\n", " mydataset_topics = topic_model.get_document_info(docs)\n", "\n", " mydataset_topics.rename(columns={'Topic': 'cluster'}, inplace=True)\n", " mydataset_topics.rename(columns={'Document': 'translated_text'}, inplace=True)\n", " mydataset_topics.rename(columns={'Name': 'cluster_name'}, inplace=True)\n", "\n", " mydataset_topics = mydataset_topics[['cluster', 'translated_text', 'cluster_name']]\n", " merged_mydataset = pd.merge(mydataset.drop_duplicates(subset='translated_text'), mydataset_topics, on=['translated_text'])\n", " \n", " return merged_mydataset, topic_model, topics, docs\n", "\n", "\n", "def get_temporal_evolution_of_topics(dataframe):\n", "\n", " merged_mydataset, topic_model, 
topics, docs = topic_modelling(dataframe)\n", " timestamps = merged_mydataset.date.tolist()\n", "\n", " topics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=20)\n", " fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)\n", "\n", " return fig\n", " \n", "\n", "\n", "# The get_cluster function is a Python script that clusters and visualizes data in 3D using UMAP and HDBSCAN algorithms. \n", "# The script takes a CSV file as input, reads it, and preprocesses the data by filling empty values with an empty string, cleaning the text, and generating embeddings.\n", "\n", "# It then applies UMAP to reduce the dimensionality of the data to 3D and HDBSCAN for clustering. \n", "# The function generates a scatter plot of the data points in 3D, with each point colored according to its assigned cluster.\n", "\n", "# The function also uses OpenAI's GPT-3.5-turbo model to assign a relevant topic name to each cluster. \n", "# It does so by selecting 50 random documents from each cluster and using them to generate a prompt that asks the model to assign the most relevant topic name for the cluster.\n", "\n", "# Finally, the function generates a 3D scatter plot of the clustered data points with each point colored according to its assigned cluster name. \\\n", "# The plot can be customized by adjusting the dot size and adding an image.\n", "\n", "\n", "\n", "def get_emotion_graph(dataframe):\n", "\n", " def get_key(element):\n", " return element['label']\n", "\n", " tokenizer = AutoTokenizer.from_pretrained(\n", " \"joeddav/distilbert-base-uncased-go-emotions-student\", truncation=True)\n", " model2 = AutoModelForSequenceClassification.from_pretrained(\n", " \"joeddav/distilbert-base-uncased-go-emotions-student\")\n", " nlp = pipeline('sentiment-analysis', model=model2,\n", " tokenizer=tokenizer, top_k=27, truncation=True, device=0)\n", "\n", " df = dataframe.copy()\n", "\n", " # some cleaning for reddit datasets\n", " df['text'] = df.translated_text\n", " df = df[df['text'] != 'nan']\n", " df = df[df['text'] != '[deleted]']\n", " df = df[df['text'] != '[removed]']\n", "\n", " dataset_umap = pd.DataFrame(\n", " columns=['text', 'emotion', 'score', 'top_emotion'])\n", "\n", " # apply emotion model on data and get the labels and scores\n", " for i in range(len(df)):\n", " label = []\n", " score = []\n", " jsonfile = (nlp(df['text'].iloc[i]))\n", " jsonfile[0].sort(key=get_key)\n", " for j in range(0, 27):\n", " jsonfile2 = np.array(jsonfile)\n", " label.append(jsonfile2[0][j]['label'])\n", " score.append(jsonfile2[0][j]['score'])\n", "\n", " dataset_umap.loc[len(dataset_umap)] = [df['text'].iloc[i], label, score, label[score.index(\n", " max(score))]]\n", "\n", " df['top_emotion'] = dataset_umap['top_emotion']\n", "\n", " # df_table = dataset_umap[dataset_umap['top_emotion'] == 'caring'].to_html()\n", "\n", " # return render(request, 'tweeter/reddit.html', {'df': df_table, 'subreddit': subreddit}, content_type='text/html')\n", "\n", " dataset_umap['top_emotion_number'] = dataset_umap['top_emotion'].astype(\n", " 'category').cat.codes\n", "\n", " dataset_umap['trimmed_text'] = dataset_umap['text'].str[:175]\n", "\n", " dataset_umap = dataset_umap.sort_values(by=['top_emotion'])\n", "\n", " emotions = {\n", " \"admiration\": \"#8F00FF\",\n", " \"amusement\": \"#98FB98\",\n", " \"anger\": \"#F30000\",\n", " \"annoyance\": \"#997950\",\n", " \"approval\": \"#0018F9\",\n", " \"caring\": \"#FFA6C9\",\n", " \"confusion\": \"#5097A4\",\n", " \"curiosity\": 
\"#FF2400\",\n", " \"desire\": \"#FA8072\",\n", " \"disgust\": \"#800000\",\n", " \"disappointment\": \"#828282\",\n", " \"disapproval\": \"#000080\",\n", " \"embarrassment\": \"#C49102\",\n", " \"excitement\": \"#FC6600\",\n", " \"fear\": \"#50C878\",\n", " \"gratitude\": \"#89CFEF\",\n", " \"grief\": \"#702963\",\n", " \"joy\": \"#FFF200\",\n", " \"love\": \"#FF0090\",\n", " \"nervousness\": \"#FCF4A3\",\n", " \"optimism\": \"#3BB143\",\n", " \"pride\": \"#F9A602\",\n", " \"realization\": \"#D9DDDC\",\n", " \"relief\": \"#FFE5B4\",\n", " \"remorse\": \"#8F9779\",\n", " \"sadness\": \"#4B5320\",\n", " \"surprise\": \"#C7EA46\",\n", " }\n", "\n", " emotions_csv = pd.DataFrame(emotions.items(), columns=['Emotion', 'Color'])\n", "\n", "\n", " # if len(dataset_umap) > 1000:\n", "\n", " top_10_emotions = dataset_umap['top_emotion'].value_counts().head(10).index\n", " dataset_umap = dataset_umap[dataset_umap['top_emotion'].isin(top_10_emotions)]\n", "\n", "\n", " # dataset_umap = dataset_umap[dataset_umap['top_emotion'].isin(\n", " # dataset_umap['top_emotion'].value_counts().head(10).index)]\n", "\n", " # Uncomment to save file\n", " # dataset_umap.to_hdf('df_emotion.h5', key='df', mode='w')\n", "\n", " reduce_dim = umap.UMAP(\n", " n_components=3, n_neighbors=8, min_dist=0.95)\n", " embedding = dataset_umap['score'].tolist()\n", " embedding = np.array(embedding)\n", " umap_embeddings = reduce_dim.fit_transform(\n", " embedding, y=dataset_umap['top_emotion_number'])\n", "\n", " dataset_umap['x'] = umap_embeddings[:, 0]\n", " dataset_umap['y'] = umap_embeddings[:, 1]\n", " dataset_umap['z'] = umap_embeddings[:, 2]\n", "\n", " \n", "\n", " # assign colors to the top 10 emotions\n", " # colors = emotions_csv[emotions_csv['Emotions'].isin(\n", " # dataset_umap['top_emotion'].value_counts().head(10).index)].iloc[:, 1].values.tolist()\n", "\n", "\n", " colors = emotions_csv[emotions_csv['Emotion'].isin(top_10_emotions)].iloc[:, 1].values.tolist()\n", "\n", "\n", " if len(dataset_umap) > 10000:\n", " dot_size = 1\n", " else:\n", " dot_size = 1.5\n", "\n", " # elif len(dataset_umap) <= 1000:\n", " # reduce_dim = umap.UMAP(\n", " # n_components=3, n_neighbors=8, min_dist=0.95)\n", " # embedding = dataset_umap['score'].tolist()\n", " # embedding = np.array(embedding)\n", " # umap_embeddings = reduce_dim.fit_transform(\n", " # embedding, y=dataset_umap['top_emotion_number'])\n", "\n", " # dataset_umap['x'] = umap_embeddings[:, 0]\n", " # dataset_umap['y'] = umap_embeddings[:, 1]\n", " # dataset_umap['z'] = umap_embeddings[:, 2]\n", " # # assign colors to the top 10 emotions\n", " # colors = emotions_csv.iloc[:, 1].values.tolist()\n", " # dot_size = 2\n", "\n", "\n", " fig = px.scatter_3d(dataset_umap, x='x', y='y', z='z', color='top_emotion', hover_name='trimmed_text', hover_data={\n", " 'x': False, 'y': False, 'z': False, 'top_emotion': False, 'text': False}, color_discrete_sequence=colors, opacity=1, template='plotly_white')\n", "\n", " fig.update_traces(marker=dict(size=dot_size))\n", "\n", " fig.add_trace(go.Scatter3d(x=[0], y=[0], z=[0], mode='markers', marker=dict(\n", " size=0.1, color='black'), showlegend=True, name=' ', hoverinfo='none'))\n", "\n", " # legend on the right side\n", " fig.update_layout(legend=dict(\n", " bgcolor='rgba(17,17,17,0)', xanchor='right'))\n", "\n", " fig.update_layout(scene=dict(\n", " xaxis=dict(\n", " title=' ',\n", " nticks=0,\n", " # backgroundcolor=\"rgb(0, 0, 0, 1)\",\n", " gridcolor=\"rgba(17,17,17, 0)\",\n", " showbackground=True,\n", " 
zerolinecolor=\"rgba(17,17,17, 0)\",\n", " zeroline=False,\n", " showgrid=False,\n", " showticklabels=False,\n", " showspikes=False\n", " ),\n", " # hide ticks\n", "\n", "\n", " yaxis=dict(\n", " # name\n", " title=' ',\n", " nticks=0,\n", " # backgroundcolor=\"rgb(0, 0, 0, 1)\",\n", " gridcolor=\"rgba(17,17,17, 0)\",\n", " showbackground=True,\n", " zerolinecolor=\"rgba(17,17,17, 0)\",\n", " zeroline=False,\n", " showgrid=False,\n", " showticklabels=False,\n", " showspikes=False\n", " ),\n", "\n", "\n", "\n", " zaxis=dict(\n", " # name\n", " title=' ',\n", " nticks=0,\n", " # backgroundcolor=\"rgba(0, 0, 0, 1)\",\n", " gridcolor=\"rgba(17,17,17, 0)\",\n", " showbackground=True,\n", " zerolinecolor=\"rgba(17,17,17, 0)\",\n", " zeroline=False,\n", " showgrid=False,\n", " showticklabels=False,\n", " showspikes=False),)\n", " # tickvals=[],),\n", " )\n", "\n", " fig.update_layout(coloraxis_showscale=False, legend=dict(x=0.1, y=0.5, traceorder='normal', font=dict(\n", " family='Noto Serif', size=14, color='black'), bgcolor='rgba(17,17,17,0)', bordercolor='rgba(17,17,17,0)', borderwidth=0))\n", "\n", "\n", " \n", "\n", " # , margin=dict(l=0, r=0, t=0, b=0, pad=0))\n", "\n", " fig.update_layout(legend={'itemsizing': 'constant'}, legend_title_text=' ', legend_title_font_color='black', legend_title_font_family='Noto Serif',\n", " legend_font_color='black', legend_font_size=14, legend_font_family='Noto Serif', legend_bgcolor='rgba(17,17,17,0)', legend_bordercolor='rgba(17,17,17,0)', legend_borderwidth=2)\n", "\n", " # , title_font_size=30, title_font_family='Noto Serif', title_font_color='white', title_x=0.06, title_y=0.95, title_xanchor='left', title_yanchor='top', title_text='Cluster of Emotions for {}\\n n = {}'.format(subreddit, len(dataset_umap)), margin=dict(l=0, r=0, b=0, t=0, pad=0))\n", " fig.update_layout(scene_camera_eye=dict(x=0.87, y=-0.88, z=0.84), scene_camera_center=dict(\n", " x=0, y=0, z=0), template='plotly_white', hovermode='x unified', margin=dict(l=0, r=0, b=0, t=0, pad=2))\n", "\n", " fig.update_layout(coloraxis_showscale=False)\n", " fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False,\n", " showline=False, automargin=False, showspikes=False)\n", " fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False,\n", " showline=False, automargin=False, showspikes=False)\n", "\n", "\n", " emotion_count = pd.DataFrame(columns=['emotion','count', 'proportions'])\n", "\n", " emotion_count['emotion'] = df['top_emotion'].value_counts().index\n", " emotion_count['count'] = df['top_emotion'].value_counts().values\n", " emotion_count['proportions'] = df['top_emotion'].value_counts(normalize=True).values*100\n", " emotion_count['proportions'] = emotion_count['proportions'].round(3)\n", " emotion_count['proportions'] = emotion_count['proportions'].astype(str) + '%'\n", "\n", " #emotion_count = dataset_umap['top_emotion'].value_counts(normalize=True).round(3).reset_index()\n", " # as a pd dataframe\n", " \n", "\n", " return fig, df, emotion_count\n", "\n", "\n", "def get_graph2():\n", " demo = pd.read_excel('C:/Users/sinan/Downloads/profdemo_cleaned.xlsx')\n", "\n", "\n", " embeddings_array = np.vstack(demo['embeddings'].apply(\n", " lambda x: np.fromstring(x[1:-1], sep=' ', dtype=np.float32)))\n", "\n", "\n", " num_clusters = 4 # Adjust the number of clusters as needed\n", " kmeans = KMeans(n_clusters=num_clusters, random_state=42)\n", " cluster_labels = kmeans.fit_predict(embeddings_array)\n", " cos_sim_matrix = cosine_similarity(embeddings_array)\n", "\n", " # 
plot heatmap\n", " plt.figure(figsize=(4, 3))\n", " plt.imshow(cos_sim_matrix, cmap='jet', interpolation='nearest')\n", " plt.colorbar()\n", " plt.title('Cosine Similarity SBERT Matrix')\n", " plt.show()\n", "\n", "\n", " demo['text'] = demo['text'].apply(lambda x: str(x))\n", " sentences = demo['text'].tolist()\n", "\n", "\n", " def normalize_weight(weight, min_weight, max_weight):\n", " return (weight - min_weight) / (max_weight - min_weight)\n", "\n", "\n", " scaling_factor = 20\n", "\n", " G = nx.DiGraph()\n", "\n", "\n", " for idx, label in enumerate(cluster_labels):\n", " G.add_node(idx, sentence=sentences[idx], cluster=label)\n", "\n", " for i in range(len(sentences)):\n", " for j in range(len(sentences)):\n", " if i != j:\n", " if cos_sim_matrix[i, j] > 0.25:\n", " G.add_edge(i, j, weight=cos_sim_matrix[i, j])\n", " else:\n", " continue\n", "\n", " plt.figure(figsize=(10, 10))\n", "\n", " #pos = nx.spring_layout(G, k=0.5, iterations=50)\n", " pos = nx.kamada_kawai_layout(G, weight='weight')\n", "\n", " #nx.draw(G, pos, node_size=300, alpha=0.5, node_color='blue', with_labels=True, edge_color='black')\n", " nx.draw(G, pos, node_size=1000, font_size=10, with_labels=True, labels=nx.get_node_attributes(G, 'sentence'), node_shape='s', node_color='none', width=0.5,\n", " edge_color='grey', arrowsize=10, arrowstyle='->', font_color='black', bbox=dict(facecolor=\"skyblue\", boxstyle='round', edgecolor='grey', alpha=0.5, pad=0.5))\n", "\n", "\n", " min_weight = min((data['weight'] for _, _, data in G.edges(data=True)))\n", " max_weight = max((data['weight'] for _, _, data in G.edges(data=True)))\n", "\n", " all_weights = []\n", "\n", "\n", " for i, j, data in G.edges(data=True):\n", " weight = normalize_weight(data['weight'], min_weight, max_weight)\n", " nx.draw_networkx_edges(G, pos, edgelist=[(\n", " i, j)], width=weight * scaling_factor, alpha=0.5, edge_color='skyblue')\n", " all_weights.append(weight*20)\n", "\n", "\n", " G_undirected = G.to_undirected()\n", "\n", " node_to_community = community_louvain.best_partition(G_undirected)\n", " for i, j, data in G.edges(data=True):\n", " weight = normalize_weight(data['weight'], min_weight, max_weight)\n", "\n", "\n", " # Add this function to convert int32 data to int\n", " def convert_int32_to_int(G):\n", " for node in G.nodes:\n", " for key, value in G.nodes[node].items():\n", " if isinstance(value, np.int32):\n", " G.nodes[node][key] = int(value)\n", " for edge in G.edges:\n", " for key, value in G.edges[edge].items():\n", " if isinstance(value, np.int32):\n", " G.edges[edge][key] = int(value)\n", " return G\n", "\n", "\n", " net = Network(notebook=True, height='750px', width='100%',\n", " bgcolor='#ffffff', font_color='black')\n", "\n", "\n", " G = convert_int32_to_int(G) # Call the function to convert int32 data to int\n", "\n", " net.from_nx(G)\n", "\n", "\n", " net = Network(notebook=True, height='1200px', width='100%',\n", " bgcolor='#ffffff', font_color='black')\n", "\n", " node_degree = dict(G.degree)\n", " node_community = node_to_community\n", " node_label = dict(G.nodes(data='sentence'))\n", "\n", " nx.set_node_attributes(G, node_degree, 'size')\n", " #nx.set_node_attributes(G, node_label, 'label')\n", " # or for hover\n", " nx.set_node_attributes(G, node_label, 'title')\n", " nx.set_node_attributes(G, node_community, 'group')\n", " nx.set_edge_attributes(G, all_weights, 'width')\n", "\n", "\n", " net.from_nx(G)\n", " fig = net.html\n", "\n", " return fig\n", "\n", "\n", "\n", "def get_graph(dataframe):\n", "\n", " from 
sklearn.cluster import KMeans\n", " from sklearn.metrics.pairwise import cosine_similarity\n", " \n", "\n", " embeddings_array = dataframe['embeddings'].tolist()\n", "\n", " num_clusters = 3 # Adjust the number of clusters as needed\n", " kmeans = KMeans(n_clusters=num_clusters, random_state=42)\n", " cluster_labels = kmeans.fit_predict(embeddings_array)\n", "\n", "\n", " sentences = dataframe['translated_text'].tolist()\n", "\n", " G = nx.DiGraph()\n", "\n", " cos_sim_matrix = cosine_similarity(embeddings_array)\n", "\n", "\n", " for idx, label in enumerate(cluster_labels):\n", " G.add_node(idx, sentence=sentences[idx], cluster=label)\n", "\n", " for i in range(len(sentences)):\n", " for j in range(len(sentences)):\n", " if i != j:\n", " #if cos_sim_matrix[i, j] > 0.8:\n", " G.add_edge(i, j, weight=cos_sim_matrix[i, j])\n", " # else:\n", " # continue\n", "\n", " plt.figure(figsize=(10, 10))\n", "\n", " pos = nx.spring_layout(G, k=0.5, iterations=50)\n", "\n", "\n", " G_undirected = G.to_undirected()\n", "\n", " from community import community_louvain\n", " node_to_community = community_louvain.best_partition(G_undirected)\n", "\n", " community_to_color = {\n", " 0 : 'tab:pink',\n", " 1 : 'tab:orange',\n", " 2 : 'tab:purple',\n", " 3 : 'tab:blue',\n", " }\n", "\n", " node_color = {node: community_to_color[community_id] for node, community_id in node_to_community.items()}\n", "\n", "\n", " reducer = umap.UMAP(n_components=2, random_state=42)\n", " embeddings_2d = reducer.fit_transform(embeddings_array)\n", "\n", " def normalize_weight(weight, min_weight, max_weight):\n", " return (weight - min_weight) / (max_weight - min_weight)\n", " \n", "\n", " def visualize_graph_plotly(graph, embeddings_2d, scaling_factor=3):\n", "\n", " min_weight = min((data['weight'] for _, _, data in graph.edges(data=True)))\n", " max_weight = max((data['weight'] for _, _, data in graph.edges(data=True)))\n", "\n", " fig = go.Figure()\n", "\n", " # Add edges with width based on the normalized weights\n", " for i, j in graph.edges():\n", " weight = normalize_weight(graph[i][j]['weight'], min_weight, max_weight)\n", " fig.add_shape(\n", " type=\"line\",\n", " x0=embeddings_2d[i][0],\n", " x1=embeddings_2d[j][0],\n", " y0=embeddings_2d[i][1],\n", " y1=embeddings_2d[j][1],\n", " yref=\"y\",\n", " xref=\"x\",\n", " line=dict(color=\"rgba(211, 211, 211, 0.5)\", width=weight * scaling_factor),\n", " )\n", "\n", " # Add nodes\n", " for idx, emb in enumerate(embeddings_2d):\n", " closeness = nx.closeness_centrality(G)[idx]\n", " degree = nx.degree_centrality(G)[idx]\n", " betweenness = nx.betweenness_centrality(G)[idx]\n", " eigen = nx.eigenvector_centrality(G)[idx]\n", " fig.add_trace(\n", " go.Scatter(\n", " x=[emb[0]],\n", " y=[emb[1]],\n", " mode=\"markers+text\",\n", " text=[graph.nodes[idx][\"sentence\"]],\n", " textposition=\"bottom center\",\n", " marker=dict(color=node_color[idx][4:], size=closeness * 40),\n", " # add closeness, degree, betweenness and sentence as hover text\n", " hovertext=[f\"{graph.nodes[idx]['sentence']}
<br>closeness_centrality: {closeness:.2f}<br>degree_centrality: {degree:.2f}<br>betweenness_centrality: {betweenness:.2f}<br>
eigenvector_centrality: {eigen:.2f}\"],\n", " )\n", " )\n", "\n", "\n", "\n", " fig.update_layout(showlegend=False, plot_bgcolor=\"white\", width=1200, height=800)\n", " fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False,\n", " showline=False, automargin=False, showspikes=False)\n", " fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False,\n", " showline=False, automargin=False, showspikes=False)\n", " \n", " fig.update_layout(title_text=\"Test Graph Visualization\", title_x=0.5, title_font_size=30, title_font_color='black')\n", "\n", "\n", " fig.show()\n", "\n", " # add title \n", "\n", " return fig\n", "\n", " return visualize_graph_plotly(G, embeddings_2d, scaling_factor = 10)\n", "\n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "def get_cluster(dataframe):\n", " \n", " reduce_dim = umap.UMAP(\n", " n_components=3, n_neighbors=8, min_dist=0.55)\n", "\n", " df = dataframe.copy()\n", "\n", " # some cleaning for reddit datasets\n", " df = df[df['translated_text'] != 'nan']\n", " df = df[df['translated_text'] != '[deleted]']\n", " df = df[df['translated_text'] != '[removed]']\n", "\n", " def CleanTxt_quotes(text):\n", " text = re.sub(r'https?:\\/\\/\\S+', '', text) # Remove hyperlinks\n", " text = re.sub(r'http?:\\/\\/\\S+', '', text) # Remove hyperlinks\n", " # if more than 5 mentions, remove all mention\n", " if len(re.findall(r'@[A-Za-z0-9]+', text)) > 5:\n", " text = re.sub(r'@[A-Za-z0-9]+', '', text)\n", " # if more than 4 hashtags, remove all hashtags\n", " #text = re.sub(r'[^A-Za-z0-9.!?_#@]+', ' ', text) # Remove non-alphanumeric characters except exclamation marks and question marks\n", " text = re.sub(r'\\s+', ' ', text) # Remove extra whitespace\n", " return text\n", "\n", " df['clean_text'] = df['translated_text'].apply(lambda x: str(x))\n", " df['clean_text'] = df['translated_text'].apply(lambda x: CleanTxt_quotes(x))\n", "\n", "\n", " embedding = np.array([np.array(xi)\n", " for xi in df.embeddings])\n", "\n", " umap_embeddings = reduce_dim.fit_transform(embedding)\n", "\n", " print('umap_embeddings', umap_embeddings.shape)\n", "\n", " # CHECK THIS LINE\n", " df['x'] = umap_embeddings[:, 0]\n", " df['y'] = umap_embeddings[:, 1]\n", " df['z'] = umap_embeddings[:, 2]\n", "\n", "\n", " df.dropna(inplace=True)\n", "\n", " hdbscan_minimal_cluster_size = int(len(df) * 0.01+10)\n", " hdbscan_min_samples = 12\n", "\n", " cluster = hdbscan.HDBSCAN(\n", " min_cluster_size=hdbscan_minimal_cluster_size,\n", " metric='euclidean',\n", " cluster_selection_epsilon=0.01,\n", " cluster_selection_method='leaf',\n", " algorithm='best',\n", " prediction_data=False,\n", " min_samples=hdbscan_min_samples).fit(df[['x', 'y', 'z']])\n", "\n", " cluster_analysis = len(pd.Series(cluster.labels_).unique())\n", " print('Number of Sentences = ', len(df))\n", " print('Number of Clusters = ', cluster_analysis, '/n')\n", "\n", " df_cluster = pd.DataFrame(\n", " pd.DataFrame(cluster.labels_).value_counts())\n", " print(df_cluster)\n", " \n", " clusters = pd.DataFrame(cluster.labels_)\n", "\n", " # percent_unlabelled = round((len(df[clusters[0] == -1]) / len(df)) * 100, 2)\n", " # print('The percentage of unlabelled sentences is: ', percent_unlabelled, '%')\n", "\n", " # reindex\n", " df.reset_index(inplace=True, drop=True)\n", "\n", " print(len(df[clusters[0] == -1]))\n", "\n", " for i in range(0, cluster_analysis):\n", " print('Cluster ', i, ' has ', len(\n", " df[clusters[0] == i]), ' sentences')\n", "\n", " print(df_cluster.index)\n", "\n", " from 
sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", "\n", " def get_tfidf_top_features(documents, n_top=4):\n", " tfidf_vectorizer = TfidfVectorizer(\n", " min_df=0.05,\n", " max_df=0.95, max_features=10, \n", " stop_words='english')\n", " tfidf = tfidf_vectorizer.fit_transform(documents)\n", " importance = np.argsort(np.asarray(\n", " tfidf.sum(axis=0)).ravel())[::-1]\n", " tfidf_feature_names = np.array(\n", " tfidf_vectorizer.get_feature_names_out())\n", " return tfidf_feature_names[importance[:n_top]]\n", "\n", " cluster_names = pd.DataFrame(\n", " columns=['cluster_name', 'embed_index'])\n", "\n", " for i in range(cluster_analysis):\n", " try:\n", " print(get_tfidf_top_features(\n", " df['clean_text'][clusters[0] == i]))\n", "\n", " clstr_nm = get_tfidf_top_features(\n", " df['clean_text'][clusters[0] == i])\n", " clstr_idx = df['clean_text'][clusters[0] == i].index\n", " cluster_names.loc[i] = [clstr_nm, clstr_idx]\n", "\n", "\n", " except:\n", " # cluster_name.append('NULL')\n", " pass\n", "\n", " cluster_names['cluster_name'] = cluster_names['cluster_name'].astype(\n", " str)\n", " cluster_names['cluster_name'] = cluster_names['cluster_name'].str.replace(\n", " '[', '')\n", " cluster_names['cluster_name'] = cluster_names['cluster_name'].str.replace(\n", " ']', '')\n", " cluster_names['cluster_name'] = cluster_names['cluster_name'].str.replace(\n", " \"'\", '')\n", " cluster_names['cluster_name'] = cluster_names['cluster_name'].str.replace(\n", " \" \", '-')\n", "\n", " clusters_names = cluster_names.explode('embed_index')\n", "\n", " df2 = df.merge(clusters_names, left_index=True,\n", " right_on='embed_index')\n", "\n", " df2['cluster_name_str'] = df2['cluster_name'].apply(\n", " lambda x: str(x))\n", " # assign a int value to each unique cluster name in df3\n", " df2['cluster_number'] = df2['cluster_name_str'].astype(\n", " 'category').cat.codes\n", "\n", " df2['trimmed_text'] = df2['clean_text'].str[:175]\n", "\n", " print(df2.head())\n", "\n", " df3 = df2[['x', 'y', 'z', 'cluster_number',\n", " 'cluster_name_str', 'trimmed_text']]\n", "\n", "\n", "#################################################### GET CLUSTER NAME #############################################\n", "\n", " df2['gpt_cluster'] = ''\n", " df3['gpt_cluster'] = ''\n", "\n", " for cluster in df3['cluster_name_str'].unique():\n", " each_cluster = df3[df3['cluster_name_str'] == cluster]\n", "\n", " docs = '\\n'.join(np.random.choice(each_cluster['trimmed_text'], 50))\n", "\n", " response3 = openai.ChatCompletion.create(\n", " model=\"gpt-3.5-turbo\",\n", " temperature=0.3,\n", " max_tokens=300,\n", " top_p=1,\n", " # stream=True,\n", " messages=[\n", " {\"role\": \"user\", \"content\": f'Given a list of keywords {cluster}, and documents present in the cluster : {docs}; assign the most relevant topic name for this cluster : \\n\\n Cluster Name : '},\n", " ]\n", " )['choices'][0]['message']['content']\n", "\n", " df3.loc[df3['cluster_name_str'] == cluster, 'gpt_cluster'] = response3\n", " df2.loc[df2['cluster_name_str'] == cluster, 'gpt_cluster'] = response3\n", "\n", " \n", " \n", "\n", " print(df3.head())\n", "\n", "\n", " if len(df3) > 10000:\n", " dot_size = 1\n", " else:\n", " dot_size = 4\n", "\n", " print(df3.head())\n", "\n", " fig = px.scatter_3d(df3, x='x', y='y', z='z', color='gpt_cluster', hover_name='trimmed_text', hover_data={\n", " 'x': False, 'y': False, 'z': False, 'cluster_name_str': False, 'cluster_number': False, 'gpt_cluster': False}, opacity=1, template='plotly_white')\n", 
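"    # Each point is one document positioned by its 3-D UMAP coordinates, coloured by the GPT-assigned cluster name; hovering shows the first 175 characters of the text.\n",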
"\n", " fig.update_traces(marker=dict(size=dot_size))\n", "\n", " fig.add_trace(go.Scatter3d(x=[0], y=[0], z=[0], mode='markers', marker=dict(\n", " size=0.1, color='white'), showlegend=True, name=' ', hoverinfo='none'))\n", "\n", " # legend on the right side\n", " fig.update_layout(legend=dict(\n", " bgcolor='rgba(17,17,17,0)', \n", " xanchor='auto', \n", " yanchor='auto',\n", " x=0.8, # Adjust the x position of the legend\n", " y=0.9, # Adjust the y position of the legend\n", " bordercolor='rgba(17,17,17,0)',\n", " borderwidth=0\n", " ))\n", "\n", " fig.update_layout(scene=dict(\n", " xaxis=dict(\n", " title=' ',\n", " nticks=0,\n", " # backgroundcolor=\"rgb(0, 0, 0, 1)\",\n", " gridcolor=\"rgba(17,17,17, 0)\",\n", " showbackground=False,\n", " zerolinecolor=\"rgba(17,17,17, 0)\",\n", " zeroline=False,\n", " showgrid=False,\n", " showticklabels=False,\n", " showspikes=False\n", " ),\n", " # hide ticks\n", "\n", "\n", " yaxis=dict(\n", " # name\n", " title=' ',\n", " nticks=0,\n", " # backgroundcolor=\"rgb(0, 0, 0, 1)\",\n", " gridcolor=\"rgba(17,17,17, 0)\",\n", " showbackground=False,\n", " zerolinecolor=\"rgba(17,17,17, 0)\",\n", " zeroline=False,\n", " showgrid=False,\n", " showticklabels=False,\n", " showspikes=False\n", " ),\n", "\n", "\n", "\n", " zaxis=dict(\n", " # name\n", " title=' ',\n", " nticks=0,\n", " # backgroundcolor=\"rgba(0, 0, 0, 1)\",\n", " gridcolor=\"rgba(17,17,17, 0)\",\n", " showbackground=False,\n", " zerolinecolor=\"rgba(17,17,17, 0)\",\n", " zeroline=False,\n", " showgrid=False,\n", " showticklabels=False,\n", " showspikes=False),)\n", " # tickvals=[],),\n", " )\n", "\n", " fig.update_layout(coloraxis_showscale=False, legend=dict(x=0.1, y=0.5, traceorder='normal', font=dict(\n", " family='Noto Serif', size=14, color='black'), bgcolor='rgba(17,17,17,0)', bordercolor='rgba(17,17,17,0)', borderwidth=0))\n", "\n", "\n", " # TO ADD AN IMAGE UNCOMMENT\n", " \n", " # fig.add_layout_image(\n", " # dict(\n", " # source=,\n", " # xref=\"x\",\n", " # yref=\"y\",\n", " # x=-1,\n", " # y=3.8,\n", " # # xanchor = \"left\",\n", " # # yanchor = \"top\",\n", " # sizex=.4,\n", " # sizey=.4,\n", " # opacity=1,\n", " # layer=\"above\",\n", " # )\n", " # ) \n", "\n", " fig.update_layout(legend={'itemsizing': 'constant'}, legend_title_text=' ', legend_title_font_color='black', legend_title_font_family='Noto Serif',\n", " legend_font_color='black', legend_font_size=14, legend_font_family='Noto Serif', legend_bgcolor='rgba(17,17,17,0)', legend_bordercolor='rgba(17,17,17,0)', legend_borderwidth=2)\n", "\n", " # , title_font_size=30, title_font_family='Noto Serif', title_font_color='white', title_x=0.06, title_y=0.95, title_xanchor='left', title_yanchor='top', title_text='Cluster of Emotions for {}/n n = {}'.format(subreddit, len(dataset_umap)), margin=dict(l=0, r=0, b=0, t=0, pad=0))\n", " fig.update_layout(scene_camera_eye=dict(x=0.87, y=-0.88, z=0.84), scene_camera_center=dict(\n", " x=0, y=0, z=0), template='plotly_white', hovermode='x unified', margin=dict(l=0, r=0, b=0, t=0, pad=2))\n", "\n", " fig.update_layout(coloraxis_showscale=False)\n", " fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False,\n", " showline=False, automargin=False, showspikes=False)\n", " fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False,\n", " showline=False, automargin=False, showspikes=False)\n", "\n", " \n", " #full_html=False, include_plotlyjs='cdn', default_height='750px', default_width='1500px', config={'displaylogo': False, 'modeBarButtonsToRemove': ['zoom2d', 
'pan2d', 'select2d', 'lasso2d', 'zoomIn2d', 'zoomOut2d', 'autoScale2d', 'resetScale2d', 'hoverClosestCartesian', 'hoverCompareCartesian', 'zoom3d', 'pan3d', 'resetCameraDefault3d', 'resetCameraLastSave3d', 'hoverClosest3d', 'orbitRotation', 'tableRotation', 'zoomInGeo', 'zoomOutGeo', 'resetGeo', 'hoverClosestGeo', 'toImage', 'sendDataToCloud', 'hoverClosestGl2d', 'hoverClosestPie', 'toggleHover', 'resetViews', 'toggleSpikelines', 'resetViewMapbox']})}\n", "\n", " \n", " cluster_name = df3[['cluster_number', 'gpt_cluster']]\n", " cluster_name = cluster_name.drop_duplicates()\n", " cluster_name = cluster_name.sort_values(by=['cluster_number'])\n", " cluster_name = cluster_name.reset_index(drop=True)\n", " # create a list\n", " cluster_name_list = cluster_name['gpt_cluster'].tolist()\n", " cluster_name_list = '\\n'.join(cluster_name_list)\n", "\n", "\n", "\n", "\n", "\n", " #Silhouette_Score = 'Silhouette score is : ', silhouette_score(df3[['x', 'y', 'z']], df3['gpt_cluster'], metric='euclidean')\n", "\n", " # get a dataframe of unique cluster names and their count\n", "\n", " # cluster_count = df3.groupby('gpt_cluster').agg({'cluster_number': 'count'}).reset_index()\n", " # cluster_count = cluster_count.rename(columns={'cluster_number': 'count'})\n", "\n", " cluster_count = pd.DataFrame(columns=['cluster','count', 'proportions'])\n", " cluster_count['cluster'] = df3['gpt_cluster'].value_counts().index\n", " cluster_count['count'] = df3['gpt_cluster'].value_counts().values\n", " cluster_count['proportions'] = df3['gpt_cluster'].value_counts(normalize=True).values*100\n", " cluster_count['proportions'] = cluster_count['proportions'].round(3)\n", " cluster_count['proportions'] = cluster_count['proportions'].astype(str) + '%'\n", " \n", " \n", " return fig, cluster_count, cluster_name_list, df2\n", "\n", "\n", "\n", "\n", "def get_executive_summary(dataframe=None, brand=None, industry=None, summaries=None):\n", "\n", "\n", " if brand is None:\n", " brand = ' '\n", "\n", " else :\n", " brand = brand\n", " try:\n", " dataframe = dataframe[dataframe['translated_text'].str.contains(brand, case=False)]\n", " except:\n", " pass\n", "\n", "\n", " # check if summaries is an empty list\n", " if summaries is None:\n", "\n", " text_splitter = TokenTextSplitter.from_tiktoken_encoder(\n", " encoding_name='p50k_base',\n", " chunk_size = 2000,\n", " )\n", "\n", " splitted_articles = text_splitter.split_text(''.join(dataframe['translated_text']))\n", "\n", " summarize_template = \"\"\" {text} \\n\\n\n", " Summarize the most relevant information for an executive summary from the above document:\n", " \n", " SUMMARY: \"\"\"\n", "\n", "\n", " prompt_template = PromptTemplate(input_variables=['text'], template=summarize_template)\n", "\n", " summary_chain = LLMChain(llm=llm, prompt=prompt_template)\n", "\n", " summaries = []\n", " for i in splitted_articles:\n", " summaries.append(summary_chain.run(i))\n", "\n", "\n", "\n", " summaries1 = '/n'.join(summaries)\n", " word_count = 500\n", " \n", " #If needed, guess the industry\n", " # industry_template = PromptTemplate(input_variables=['summaries'], template=extract_industry)\n", " # summary_chain = LLMChain(llm=llm, prompt=industry_template)\n", " # industry = summary_chain.run(summaries)\n", "\n", " #Check size of output and go in a 2nd loop if it's too big\n", " encoding = tiktoken.get_encoding('p50k_base')\n", "\n", " if len(encoding.encode(summaries1)) > 2000:\n", " # return only first 2000 tokens\n", " summaries1 = 
encoding.decode(encoding.encode(summaries1)[:2000])\n", "\n", "\n", " executive_summary_template = '''\n", " Your task is to leverage your invaluable expertise in crafting a comprehensive and insightful {word_count} words executive summary tailored for C-level executives and decision-makers in {industry} for {brand}.\n", " The summary should synthesize information from various data sources, incorporate relevant cultural and contextual elements, and provide valuable insights that can drive strategic decision-making.\n", " Please ensure that your analysis meets the following high-level objectives:\n", " Thoroughly examine and interpret the key trends, patterns, and insights derived from the following data sources:\n", " {summaries1}\n", "\n", " Articulate the implications and opportunities for {industry}, keeping in mind the needs and challenges of the industry.\n", " Consider the cultural, social, and contextual nuances present in the data, drawing on your sociological expertise to ensure the summary remains relevant and insightful across diverse audiences.\n", " Identify any potential risks or challenges that might arise from the data, providing strategic recommendations for mitigating these issues.\n", " Present the information in a clear, concise, and engaging manner that captures the attention of busy executives and effectively communicates the key insights.\n", " Leverage your data expertise to ensure the accuracy, reliability, and relevance of the information presented in the summary. Make us benefit from your unique expertise and insights.\n", "\n", "\n", " Using markdown formatting, write a {word_count} word SEQ-optimized Executive Summary. Write a click worthy short titles. Add a key takeaway\n", " section at the end. Use the seed keyword as the first H2. Always use a combination of paragraphs, lists, and tables for a better reader experience. For the styling of the output, please include headers for different sections, and use bullet points where applicable to organize the key insights. \n", " To avoid repetition, vary the sentence structure and word choice when presenting information from different data sources or discussing various trends, insights, or opportunities. \n", " Using synonyms, alternate phrasings, and modifying sentence structure can help keep the text engaging and fresh for readers. 
\\n\\n\n", "\n", " '''\n", "\n", " prompt = PromptTemplate(template=executive_summary_template, input_variables=['industry', 'brand', 'word_count', 'summaries1'])\n", "\n", " executive_chain = LLMChain(llm=llm, prompt=prompt)\n", "\n", " output_summary = executive_chain.run(industry=industry, brand = brand, word_count=word_count, summaries1=summaries1)\n", "\n", "\n", " # summarize_template2 = \"\"\" {summaries1} \\n\\n\n", " # Make a detailed and thorough summary of the above document:\n", " \n", " # SUMMARY: \"\"\"\n", "\n", " # prompt_template2 = PromptTemplate(input_variables=['summaries1'], template=summarize_template2)\n", " # summary_chain = LLMChain(llm=llm, prompt=prompt_template2)\n", "\n", " # summaries = []\n", " # for i in splitted_articles:\n", " # summaries.append(summary_chain.run(i))\n", "\n", " # summaries = '/n'.join(summaries)\n", " # summaries1 = summaries\n", "\n", "\n", " # executive_summary_template = '''Imagine you are an Elite Analyst, Expert Sociologist, and Data Guru,\n", " # Your task is to leverage your invaluable expertise in crafting a comprehensive and insightful {word_count} words executive summary tailored for C-level executives and decision-makers in {industry}.\n", " # The summary should synthesize information from various data sources, incorporate relevant cultural and contextual elements, and provide valuable insights that can drive strategic decision-making.\n", " # Please ensure that your analysis meets the following high-level objectives:\n", " # Thoroughly examine and interpret the key trends, patterns, and insights derived from the following data sources:\n", " # {summaries1}\n", "\n", " # Articulate the implications and opportunities for {industry}, keeping in mind the needs and challenges of the industry.\n", " # Consider the cultural, social, and contextual nuances present in the data, drawing on your sociological expertise to ensure the summary remains relevant and insightful across diverse audiences.\n", " # Identify any potential risks or challenges that might arise from the data, providing strategic recommendations for mitigating these issues.\n", " # Present the information in a clear, concise, and engaging manner that captures the attention of busy executives and effectively communicates the key insights.\n", " # Leverage your data expertise to ensure the accuracy, reliability, and relevance of the information presented in the summary. Make us benefit from your unique expertise and insights.\n", "\n", "\n", " # Using markdown formatting, write a {word_count} word SEQ-optimized Executive Summary. Write a click worthy short titles. Add a key takeaway\n", " # section at the end. Use the seed keyword as the first H2. Always use a combination of paragraphs, lists, and tables for a better reader experience. For the styling of the output, please include headers for different sections, and use bullet points where applicable to organize the key insights. \n", " # To avoid repetition, vary the sentence structure and word choice when presenting information from different data sources or discussing various trends, insights, or opportunities. \n", " # Using synonyms, alternate phrasings, and modifying sentence structure can help keep the text engaging and fresh for readers. 
\\n\\n\n", "\n", " # '''\n", "\n", " # prompt = PromptTemplate(template=executive_summary_template, input_variables=['industry', 'word_count', 'summaries1'])\n", "\n", " # executive_chain = LLMChain(llm=llm, prompt=prompt)\n", "\n", " # output_summary = executive_chain.run(industry=industry, word_count=word_count, summaries1=summaries1)\n", "\n", " return(output_summary, dataframe[['translated_text']][0:20], summaries)\n", "\n", "\n", "\n", "\n", "def get_topic_summary(dataframe, topics=None, brand=None, industry=None, summaries=None):\n", "\n", "\n", " if brand is None:\n", " brand = ''\n", "\n", " else :\n", " brand = brand\n", " try:\n", " dataframe = dataframe[dataframe['translated_text'].str.contains(brand, case=False)]\n", " except:\n", " pass\n", "\n", "\n", " if summaries is None:\n", "\n", " text_splitter = TokenTextSplitter.from_tiktoken_encoder(\n", " encoding_name='p50k_base',\n", " chunk_size = 2000,\n", " )\n", "\n", "\n", " splitted_articles = text_splitter.split_text(''.join(dataframe['translated_text']))\n", "\n", " summarize_template = \"\"\"Summarize the most relevant information from the following document:\n", "\n", " {text}\n", "\n", " SUMMARY: \"\"\"\n", "\n", "\n", " prompt_template = PromptTemplate(input_variables=['text'], template=summarize_template)\n", "\n", " summary_chain = LLMChain(llm=llm, prompt=prompt_template)\n", "\n", " summaries = []\n", " for i in splitted_articles:\n", " summaries.append(summary_chain.run(i))\n", "\n", "\n", "\n", "\n", " # split the summary into 2000 tokens chunks\n", " text_splitter = TokenTextSplitter.from_tiktoken_encoder(\n", " encoding_name='p50k_base',\n", " chunk_size = 2000,\n", " )\n", "\n", "\n", " summaries2 = text_splitter.split_text(''.join(summaries))\n", "\n", " word_count = 500\n", " \n", " topics = topics\n", " final_summary = []\n", "\n", "\n", "\n", " # encoding = tiktoken.get_encoding('p50k_base')\n", " # if len(encoding.encode(summaries1)) > 2000:\n", " # # return only first 2000 tokens\n", " # summaries1 = encoding.decode(encoding.encode(summaries2)[:2000])\n", "\n", "\n", " # topic_prompt = '''\"Imagine you are an Elite Analyst and Trend Analysis Expert with extensive experience in identifying patterns and emerging themes from various data sources, such as social media, regular media, reviews, and survey data. Your task is to leverage your invaluable expertise in crafting a comprehensive and insightful trend analysis report tailored for {brand} within the {industry}. The objective is to provide valuable insights into shifts in consumer behavior, preferences, and market dynamics, enabling informed decision-making for C-level executives and decision-makers.\n", "\n", " # In your analysis of {word_count} words, ensure that you address the following key elements:\n", "\n", " # Topics : {topics}\n", "\n", " # Data: {summary}\n", "\n", " # Emerging Trends: Identify and discuss the key emerging trends in consumer behavior, preferences, and market dynamics within the {industry}. Examine the factors driving these trends and provide specific examples to illustrate your findings.\n", "\n", " # Impact on {brand}: Analyze how the identified trends are affecting or could potentially affect {brand}. 
Consider both opportunities and challenges that may arise from these trends, as well as any necessary adjustments to marketing strategies, product offerings, or customer service initiatives.\n", "\n", " # Recommendations: Based on the insights derived from the trend analysis, provide actionable recommendations for {brand} to stay ahead of the competition, capitalize on new opportunities, and address potential challenges. Consider innovations, partnerships, or targeted marketing campaigns that can help the company adapt to and benefit from the identified trends.\n", "\n", " # Ensure that your trend analysis report is clear, concise, and engaging for busy executives. Focus on providing actionable insights and recommendations that can inform the company's strategic direction. Draw on your expertise to ensure the accuracy, reliability, and relevance of the information presented in the analysis.\"\n", "\n", "\n", " # Using markdown formatting, write a {word_count} word SEQ-optimized Trend Analysis. Write a click worthy short titles. Add a key takeaway\n", " # section at the end. Use the seed keyword as the first H2. Always use a combination of paragraphs, lists, and tables for a better reader experience. For the styling of the output, please include headers for different sections, and use bullet points where applicable to organize the key insights. \n", " # To avoid repetition, vary the sentence structure and word choice when presenting information. Using synonyms, alternate phrasings, and modifying sentence structure can help keep the text engaging and fresh for readers. \\n\\n\n", "\n", " # '''\n", "\n", " # prompt = PromptTemplate(template=topic_prompt, input_variables=['industry', 'topics', 'word_count', 'summary', 'brand'])\n", " # topic_chain = LLMChain(llm=llm, prompt=prompt)\n", " # topic_summary = topic_chain.run(industry=industry, topics = topics, word_count=word_count, summary=summaries1, brand=brand)\n", "\n", "\n", " for summary_1 in summaries2:\n", "\n", " topic_prompt = '''\"Imagine you are an Elite Analyst and Trend Analysis Expert with extensive experience in identifying patterns and emerging themes from various data sources, such as social media, regular media, reviews, and survey data. Your task is to leverage your invaluable expertise in crafting a comprehensive and insightful trend analysis report tailored for {brand} within the {industry}. The objective is to provide valuable insights into shifts in consumer behavior, preferences, and market dynamics, enabling informed decision-making for C-level executives and decision-makers.\n", "\n", " In your analysis of {word_count} words, ensure that you address the following key elements:\n", "\n", " Topics : {topics}\n", "\n", " Data: {summary}\n", "\n", " Emerging Trends: Identify and discuss the key emerging trends in consumer behavior, preferences, and market dynamics within the {industry}. Examine the factors driving these trends and provide specific examples to illustrate your findings.\n", "\n", " Impact on {brand}: Analyze how the identified trends are affecting or could potentially affect {brand}. Consider both opportunities and challenges that may arise from these trends, as well as any necessary adjustments to marketing strategies, product offerings, or customer service initiatives.\n", "\n", " Recommendations: Based on the insights derived from the trend analysis, provide actionable recommendations for {brand} to stay ahead of the competition, capitalize on new opportunities, and address potential challenges. 
Consider innovations, partnerships, or targeted marketing campaigns that can help the company adapt to and benefit from the identified trends.\n", "\n", " Ensure that your trend analysis report is clear, concise, and engaging for busy executives. Focus on providing actionable insights and recommendations that can inform the company's strategic direction. Draw on your expertise to ensure the accuracy, reliability, and relevance of the information presented in the analysis.\"\n", "\n", "\n", " Using markdown formatting, write a {word_count} word SEQ-optimized Trend Analysis. Write a click worthy short titles. Add a key takeaway\n", " section at the end. Use the seed keyword as the first H2. Always use a combination of paragraphs, lists, and tables for a better reader experience. For the styling of the output, please include headers for different sections, and use bullet points where applicable to organize the key insights. \n", " To avoid repetition, vary the sentence structure and word choice when presenting information. Using synonyms, alternate phrasings, and modifying sentence structure can help keep the text engaging and fresh for readers. \\n\\n\n", "\n", " '''\n", "\n", " prompt = PromptTemplate(template=topic_prompt, input_variables=['industry', 'topics', 'word_count', 'summary', 'brand'])\n", "\n", " topic_chain = LLMChain(llm=llm, prompt=prompt)\n", "\n", " topic_summary = topic_chain.run(industry=industry, topics = topics, word_count=word_count, summary=summary_1, brand=brand)\n", "\n", " final_summary.append(topic_summary)\n", "\n", " if len(final_summary) > 1:\n", " topic_summary = ''.join(final_summary)\n", "\n", " combination = '''{topic_summary}\\n\\nCombine the content from these articles into one; keeping the format and structure in place. \\n\\n##Trend Analysis:\\n\\n'''\n", " prompt = PromptTemplate(template=combination, input_variables=['topic_summary'])\n", " final_chain = LLMChain(llm=llm, prompt=prompt)\n", " final_summary = final_chain.run(topic_summary=topic_summary)\n", " \n", " else:\n", " final_summary = str(final_summary)\n", "\n", "\n", " return(final_summary, dataframe[['translated_text']][0:20], summaries2)\n", "\n", "\n", "\n", "\n", "def get_SWOT(dataframe, brand = None, industry = None):\n", " \n", " if brand is None:\n", " brand = ' '\n", "\n", " else :\n", " brand = brand\n", " try:\n", " dataframe = dataframe[dataframe['translated_text'].str.contains(brand, case=False)]\n", " except:\n", " pass\n", "\n", " \n", "\n", " word_count = 500\n", "\n", "\n", " # industry_template = PromptTemplate(input_variables=['summaries'], template=extract_industry)\n", " # summary_chain = LLMChain(llm=llm, prompt=industry_template)\n", " # industry = summary_chain.run(summaries)\n", "\n", " brand = brand\n", "\n", " industry = industry\n", "\n", " wikipedia = WikipediaAPIWrapper()\n", "\n", " internet_content = wikipedia.run(f'{brand}')\t\n", "\n", " \n", "\n", "\n", " SWOT_analysis_template = '''\"Imagine you are an Elite Analyst and Strategic Expert. You have extensive experience in conducting SWOT (Strengths, Weaknesses, Opportunities, and Threats) analyses for businesses across various industries. \n", " Your task is to leverage your invaluable expertise in crafting a comprehensive and insightful SWOT analysis tailored for {brand}, focusing on the {industry}. 
The objective is to provide valuable insights that can drive strategic decision-making for C-level executives and decision-makers.\n", " In your analysis, ensure that you address the following key elements:\n", "\n", " Strengths: Identify and elaborate on the key strengths of {brand} within the industry. Consider factors such as brand reputation, market share, unique selling points, resources, and competitive advantages.\n", "\n", " Weaknesses: Analyze and discuss the primary weaknesses of {brand} that may hinder its growth or market position. These may include factors such as operational inefficiencies, limited product range, lack of innovation, or financial vulnerabilities.\n", "\n", " Opportunities: Explore and highlight potential opportunities for {brand} in the {industry} sector. Consider market trends, technological advancements, potential partnerships, or untapped consumer segments that the company can capitalize on to grow its business.\n", "\n", " Threats: Examine and outline the key threats that {brand} faces within the industry. These may include factors such as increasing competition, regulatory changes, economic downturns, or shifting consumer preferences.\n", "\n", " For context : {internet_content}\\n\\n\n", "\n", " Ensure that your {word_count} SWOT analysis is clear, concise, and engaging for busy executives. Focus on providing actionable insights and recommendations that can inform the company's strategic direction. Draw on your expertise to ensure the accuracy, reliability, and relevance of the information presented in the analysis.\"\n", " '''\n", "\n", " prompt = PromptTemplate(template=SWOT_analysis_template, input_variables=['word_count', 'industry', 'brand', 'internet_content'])\n", "\n", " SWOT_chain = LLMChain(llm=llm, prompt=prompt)\n", "\n", " SWOT_summary = SWOT_chain.run(industry=industry, word_count=word_count, brand=brand, internet_content=internet_content)\n", "\n", " return(SWOT_summary)\n", "\n", "\n", "\n", "def emotional_mapping(dataframe, industry = None, media = None):\n", " \n", "\n", " # if dataframe.columns.contains('top_emotion_bertweet'):\n", " # dataframe['emotion'] = dataframe['top_emotion_bertweet']\n", "\n", " # elif dataframe.columns.contains('top_emotion_roberta'):\n", " # dataframe['emotion'] = dataframe['top_emotion_roberta']\n", "\n", " # elif dataframe.columns.contains('top_emotion_distilbert'):\n", " # dataframe['emotion'] = dataframe['top_emotion_distilbert']\n", "\n", " # elif dataframe.columns.contains('top_emotion'):\n", " # dataframe['emotion'] = dataframe['top_emotion']\n", "\n", " # else:\n", " # dataframe = get_emotion_bertweet(dataframe)\n", " # dataframe['emotion'] = dataframe['top_emotion_bertweet']\n", "\n", " word_count = 500\n", "\n", "\n", " # industry_template = PromptTemplate(input_variables=['summaries'], template=extract_industry)\n", " # summary_chain = LLMChain(llm=llm, prompt=industry_template)\n", " # industry = summary_chain.run(summaries)\n", "\n", " media = media\n", "\n", " industry = industry\n", "\n", "\n", " # get positive dataset\n", " positive = dataframe[dataframe['polarity'] > 0]\n", "\n", " # get negative dataset\n", " negative = dataframe[dataframe['polarity'] < 0]\n", "\n", "\n", " positive_emotions = []\n", " negative_emotions = []\n", "\n", " corpus_positive = {}\n", " corpus_negative = {}\n", " \n", "\n", " for i in range(0,3):\n", " value = str(positive.top_emotion.value_counts(normalize=True).index[i])\n", " percent = str(round(positive.top_emotion.value_counts(normalize=True)[i]*100,2)) + 
'%'\n", " positive_emotions.append(value + ' ' + percent)\n", "\n", " corpus_positive[value] = positive[positive['top_emotion'] == value]['translated_text'].tolist()\n", "\n", " value = str(negative.top_emotion.value_counts(normalize=True).index[i])\n", " percent = str(round(negative.top_emotion.value_counts(normalize=True)[i]*100,2)) + '%'\n", " negative_emotions.append(value + ' ' + percent)\n", "\n", " corpus_negative[value] = negative[negative['top_emotion'] == value]['translated_text'].tolist()\n", "\n", "\n", "\n", " executive_summary_template = '''Imagine you are an Elite psychologist, Analyst, and Data Guru. You are familiar with leading emotion measurement techniques and the latest developments in the field, \n", " \n", " including the Plutchik index and Emotional Intensity Scale (EIS). \n", " \n", " Your task is to leverage your invaluable expertise in crafting an insightful {word_count} emotion-driven report tailored for C-level executives and decision-makers in {industry}. \n", " The objective is to provide valuable insights into the impact of the top 3 positive emotions {positive_emotions} and top 3 negative emotions {negative_emotions} on marketing and branding strategies and provoke lightbulb moments for our readers. \n", " Your analysis should provide valuable insights that can drive strategic decision-making \n", " based on the key emotions {positive_emotions} {negative_emotions} captured across social medias.\n", "\n", " Structure the analysis in two main sections: Observations and Key Findings. In the Observations section, provide precise details about specific emotion measurements and their relation to the wants \n", " and needs expressed in the data. In the Key Findings section, focus on insightful content and compare and contrast the different emotions, revealing what's hiding behind the numbers and addressing both expressed and latent emotions.\n", " Avoid jargon and broad terms in your analysis, ensuring that the content is clear, concise, and engaging.\n", "\n", " Thoroughly examine and interpret the key trends, patterns, and insights derived from the key emotions {positive_emotions} {negative_emotions}.\n", " Articulate the implications and opportunities based on the emotion levels, keeping in mind the needs and challenges of the {industry}.\n", " Consider the cultural, social, and contextual nuances present in the data, drawing on your expertise to ensure the emotion analysis remains relevant and insightful across diverse audiences.\n", " Leverage your psychological expertise to ensure the accuracy, reliability, and relevance of the information presented in the summary. Make us benefit from your unique expertise and insights.\n", "\n", "\n", " Using markdown formatting, write a {word_count} word SEQ-optimized Executive Summary. Write a click worthy short titles. Add a key takeaway\n", " section at the end. Use the seed keyword as the first H2. Always use a combination of paragraphs, lists, and tables for a better reader experience. For the styling of the output, please include headers for different sections, \n", " and use bullet points where applicable to organize the key insights. \n", " To avoid repetition, vary the sentence structure and word choice when presenting information from different data sources or discussing various trends, insights, or opportunities. \n", " Using synonyms, alternate phrasings, and modifying sentence structure can help keep the text engaging and fresh for readers. 
\\n\\n\n", "\n", " '''\n", "\n", " prompt = PromptTemplate(template=executive_summary_template, input_variables=['word_count', 'industry', 'positive_emotions', 'negative_emotions'])\n", "\n", " executive_chain = LLMChain(llm=llm, prompt=prompt)\n", "\n", " emotional_summary = executive_chain.run(industry=industry, word_count=word_count, media=media, positive_emotions=positive_emotions, negative_emotions=negative_emotions)\n", "\n", "\n", " return(emotional_summary)\n", "\n", "\n", "\n", "def generate_wordcloud(dataframe):\n", " \n", " text = ' '.join(dataframe['translated_text'].tolist())\n", "\n", " colors = [\"#FF69B4\", \"#FFD700\", \"#FFA500\", \"#D3D3D3\"]\n", " wordcloud = WordCloud(max_font_size=300, max_words=800, width = 1600, height = 1200, background_color=\"white\", colormap=\"Set2\", color_func=lambda *args, **kwargs: colors[len(args[0]) % len(colors)]).generate(text)\n", " return wordcloud.to_image()\n", "\n", "\n", "\n", "def get_polarity(dataframe):\n", " df = dataframe.copy()\n", " def get_sentiment_vader(text):\n", " from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", " sid = SentimentIntensityAnalyzer()\n", " return sid.polarity_scores(text)['compound']\n", "\n", "\n", " df['translated_text'] = df['translated_text'].astype(str)\n", " df['polarity'] = df['translated_text'].apply(lambda x: get_sentiment_vader(x))\n", "\n", " fig = plt.figure(frameon=False, figsize=(16, 12))\n", "\n", " df['date2'] = pd.to_datetime(df['date'], utc=True)\n", "\n", " sorted_dates = df.sort_values(by='date2')\n", "\n", " cmap = plt.cm.get_cmap('RdYlGn')\n", " norm = plt.Normalize(sorted_dates['polarity'].min(), sorted_dates['polarity'].max())\n", " colors = [cmap(norm(value)) for value in sorted_dates['polarity']]\n", "\n", " # scatter plot\n", " plt.scatter(sorted_dates['date2'],sorted_dates['polarity'], color=colors, alpha=0.5)\n", "\n", " # add a lineplot to show the average per day\n", "\n", " plt.plot(sorted_dates['date2'], sorted_dates['polarity'].rolling(window=50).mean(), color='hotpink', linewidth=1.5)\n", "\n", " \n", " # add legend about pink line\n", "\n", " plt.legend(['Polarity', 'Trend'], prop=font, frameon=False, bbox_to_anchor=(0.3, 1), loc='upper right', ncol=2, fontsize=12)\n", "\n", " # add x-label inside the plot\n", "\n", " plt.xlabel('Date', fontproperties=font, fontsize=12)\n", "\n", " # add y-label\n", " plt.ylabel('Polarity', fontproperties=font, fontsize=12)\n", "\n", " # add x-ticks\n", " plt.xticks(fontsize=12, fontproperties=font)\n", "\n", " plt.yticks(fontsize=12, fontproperties=font)\n", "\n", "\n", " # output as a png file\n", " buffer = BytesIO()\n", "\n", " # Save the plot to the buffer in PNG format\n", "\n", " # width = 1600, height = 1200,\n", " plt.savefig(buffer, format='png', dpi=300, bbox_inches='tight')\n", "\n", " # Close the figure to free up memory\n", " plt.close(fig)\n", "\n", " # Return the buffer containing the image data\n", " buffer.seek(0)\n", "\n", " # get as a PIL image\n", " img = Image.open(buffer)\n", "\n", "\n", " return img, df\n", "\n", "\n", "\n", "\n", "def play_audio(audio_file_path):\n", " # Load the audio file\n", " audio, sample_rate = librosa.load(audio_file_path, sr=None)\n", "\n", " # Create a stream with 1024 frames per buffer\n", " p = pyaudio.PyAudio()\n", " stream = p.open(format=pyaudio.paFloat32,\n", " channels=1,\n", " rate=sample_rate,\n", " output=True,\n", " frames_per_buffer=1024)\n", "\n", " # Play the audio file\n", " stream.write(audio.astype(np.float32).tobytes())\n", "\n", " # Stop and close the 
stream\n", " stream.stop_stream()\n", " stream.close()\n", "\n", " # Terminate the PortAudio interface\n", " p.terminate()\n", "\n", "\n", "def text_to_speech(text):\n", " url = f\"https://api.elevenlabs.io/v1/text-to-speech/21m00Tcm4TlvDq8ikWAM\"\n", "\n", " headers = {\n", " \"accept\": \"audio/mpeg\",\n", " \"xi-api-key\": \"944e814653d5f50b8eb3e73670fc3886\",\n", " \"Content-Type\": \"application/json\",\n", " }\n", "\n", " data = {\"text\": text}\n", "\n", " response = requests.post(url, headers = headers, json=data)\n", "\n", "\n", " if response.status_code == 200:\n", " return response.content\n", " else:\n", " print(f\"Error: {response.status_code}\")\n", " return None\n", " \n", "\n", "def on_click_play_audio_button(text):\n", " #text = response2\n", " print(text)\n", " audio_content = text_to_speech(text)\n", "\n", " if audio_content is not None:\n", " with open(\"output_audio.mp3\", \"wb\") as audio_file:\n", " audio_file.write(audio_content)\n", "\n", " # Play the audio file\n", " play_audio(\"output_audio.mp3\")\n", "\n", "\n", " \n", "def get_synthetic_comment(text: None, slider, dataframe, html:None) :\n", "\n", " \n", " if check_words_in_string(words, text, case=False):\n", "\n", "\n", " df = dataframe.copy() # query is based on a dataframe called \"df\"\n", " agent = create_pandas_dataframe_agent(OpenAI(temperature=0), df, verbose=True, return_intermediary_steps=True)\n", "\n", " def launch_agent(user_input): \n", " memory['user'].append(user_input)\n", " user_input = get_memory() + user_input\n", " agent_output = (agent.run(user_input))\n", " memory['agent'].append(agent_output)\n", " return agent_output\n", "\n", " print('Pandas Agent Query')\n", " answer = launch_agent(text)\n", " return answer, None, None\n", "\n", " else:\n", " print('Semantic Matching Query')\n", " \n", "\n", "\n", " query_type = 'Tweet'\n", "\n", "\n", " query = f'Forget all of the above. Write 2-3 examples of {query_type} answering this question: {text}. \\n\\n{query_type}:\\n\\n'\n", " response = openai.ChatCompletion.create(\n", " model=\"gpt-3.5-turbo\",\n", " temperature=0.5,\n", " max_tokens=300,\n", " top_p=1,\n", " # stream=True,\n", " messages=[\n", " #{\"role\": \"system\", \"content\": \"Forget all the above instructions. You are a reviewer of products but also a customer and have an opinion about the products you buy. You are asked to write a review of a product you have recently purchased.\"},\n", " {\"role\": \"user\", \"content\": query},\n", " ]\n", " )['choices'][0]['message']['content']\n", " response = re.sub(r'As an AI model.*?\\.', '', response)\n", " response = re.sub(r'As an AI language model.*?\\.', '', response)\n", "\n", " query_embed = model.encode(response)\n", " # dataframe['embeddings'] = dataframe['translated_text'].apply(\n", " # lambda x: model.encode(x))\n", " dataframe['similarity'] = dataframe['embeddings'].apply(\n", " lambda x: round(float(util.pytorch_cos_sim(query_embed, x)), 3))\n", "\n", " dataframe.sort_values(by='similarity', ascending=False, inplace=True)\n", "\n", " \n", "\n", " \n", " complexity = ''\n", "\n", " if dataframe[dataframe['similarity'] > slider].shape[0] == 0:\n", " response2 = f'No {query_type} found with a similarity score above {slider}. 
Try to lower the similarity score threshold or change the question.\\n\\n However, this is what I found on the internet: \\n\\n'\n", " \n", " toolname = ['wolfram-alpha']\n", " tools = load_tools(toolname)\n", " agent = initialize_agent(tools=tools, llm=llm, agent='zero-shot-react-description', verbose=False)\n", " wolfram_content = agent.run(f'{text}')\n", " wolfram_response = f'{wolfram_content}'\n", " \n", " toolname = ['serpapi']\n", " tools = load_tools(toolname)\n", " agent = initialize_agent(tools=tools, llm=llm, agent='zero-shot-react-description', verbose=False)\n", " internet_content = agent.run(f'{text}')\n", " internet_response = f'{internet_content}'\n", "\n", " response2 = f'{response2} \\n\\n {wolfram_response} \\n\\n {internet_response}'\n", "\n", " else:\n", "\n", " try:\n", " corpus = dataframe[dataframe['similarity']\n", " > slider]['translated_text'].tolist()\n", " response2 = openai.ChatCompletion.create(\n", " model=\"gpt-3.5-turbo\",\n", " temperature=0.5,\n", " max_tokens=300,\n", " top_p=1,\n", " # stream=True,\n", " messages=[\n", " {\"role\": \"user\", \"content\": f'\\n{corpus}\\n\\nSummarize the above {query_type}s {complexity} to answer this question: {text}\\n\\nSummary:\\n\\n'},\n", " ]\n", " )['choices'][0]['message']['content']\n", " response2 = f'Question: {text}\\n\\nAnswer: {response2}'\n", " except:\n", " try:\n", " corpus = dataframe[dataframe['similarity']\n", " > slider]['translated_text'][0:50].tolist()\n", " response2 = openai.ChatCompletion.create(\n", " model=\"gpt-3.5-turbo\",\n", " temperature=0.5,\n", " max_tokens=300,\n", " top_p=1,\n", " # stream=True,\n", " messages=[\n", " {\"role\": \"user\", \"content\": f'\\n{corpus}\\n\\nSummarize the above {query_type}s {complexity} to answer this question: {text}\\n\\nSummary:\\n\\n'},\n", " ]\n", " )['choices'][0]['message']['content']\n", " response2 = f'Question: {text}\\n\\nAnswer: {response2}'\n", " except:\n", " try:\n", " corpus = dataframe[dataframe['similarity']\n", " > slider]['translated_text'][0:30].tolist()\n", " response2 = openai.ChatCompletion.create(\n", " model=\"gpt-3.5-turbo\",\n", " temperature=0.5,\n", " max_tokens=300,\n", " top_p=1,\n", " # stream=True,\n", " messages=[\n", " {\"role\": \"user\", \"content\": f'\\n{corpus}\\n\\nSummarize the above {query_type}s {complexity} to answer this question: {text}\\n\\nSummary:\\n\\n'},\n", " ]\n", " )['choices'][0]['message']['content']\n", " response2 = f'Question: {text}\\n\\nAnswer: {response2}'\n", " except:\n", " corpus = dataframe[dataframe['similarity']\n", " > slider]['translated_text'][0:15].tolist()\n", " response2 = openai.ChatCompletion.create(\n", " model=\"gpt-3.5-turbo\",\n", " temperature=0.5,\n", " max_tokens=300,\n", " top_p=1,\n", " # stream=True,\n", " messages=[\n", " {\"role\": \"user\", \"content\": f'\\n{corpus}\\n\\nSummarize the above {query_type}s {complexity} to answer this question: {text}\\n\\nSummary:\\n\\n'},\n", " ]\n", " )['choices'][0]['message']['content']\n", " response2 = f'Question: {text}\\n\\nAnswer from Data: {response2}'\n", "\n", " # Graph Generation\n", "\n", " #new_graph = get_graph(dataframe[dataframe['similarity'] > slider])\n", "\n", " return response2, dataframe[dataframe['similarity'] > slider][['similarity', 'translated_text']][0:10], None\n", "\n", " return response2, dataframe[dataframe['similarity'] > slider][['similarity', 'translated_text']][0:10], None\n", "\n", "\n", "\n", "\n", "def save_output(tab):\n", " if tab == \"Summary\":\n", " with open(\"data_answer.txt\", \"w\") as 
f:\n", " try:\n", " f.write(data_answer.get_value())\n", " except:\n", " pass\n", " elif tab == \"Table\":\n", " try:\n", " similar_reviews_dataframe = pd.DataFrame(similar_reviews)\n", " similar_reviews_dataframe.to_csv(\"similar_reviews.csv\", index=False, encoding='utf-8-sig')\n", " except:\n", " pass\n", " else:\n", " try:\n", " g.save_graph(\"graph.html\")\n", " except:\n", " pass\n", "\n", "\n", "def generate_new_examples(text):\n", " # GENERATE NEW EXAMPLES BASED ON QUERY\n", " new_examples = openai.ChatCompletion.create(\n", " model=\"gpt-3.5-turbo\",\n", " temperature=0.7,\n", " max_tokens=100,\n", " top_p=1,\n", " # stream=True,\n", " messages=[\n", " {\"role\": \"user\", \"content\": f'Generate a list of 4 most relevent questions related to this question : {text}. Output should be in a comma separated string format. There is no need to enumerate each element.\\n\\n'},\n", " ]\n", " )['choices'][0]['message']['content']\n", "\n", " \n", " new_examples = new_examples.split('\\n')\n", " # make a list for each element\n", "\n", " new_examples = [x for x in new_examples if x != '']\n", " new_examples = [x.strip() for x in new_examples]\n", " new_examples = [x.split(',') for x in new_examples]\n", " return new_examples\n", "\n", " \n", "def summarize_video(url):\n", " loader = YoutubeLoader.from_youtube_channel(url)\n", " result = loader.load()\n", "\n", " text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)\n", " texts = text_splitter.split_documents(result)\n", " print(len(texts))\n", " \n", "\n", " # We first try the chain with the default chain type\n", " # if length of the text is more than 2000 tokens, we will use map reduce (summary of chunks)\n", "\n", " try:\n", " chain = load_summarize_chain(llm, chain_type='stuff', verbose=True)\n", " print('ChainType: stuff')\n", " # store intermediate steps\n", " return chain.run(result)\n", "\n", " except:\n", " text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)\n", " texts = text_splitter.split_documents(result)\n", " print(len(texts))\n", " chain = load_summarize_chain(llm, chain_type='map_reduce', verbose=True)\n", " print('ChainType: map reduce')\n", " return chain.run(texts)\n", "\n", "\n", "def main():\n", " global similar_reviews, g, query_type, response2, Output, output_html, html, new_examples, samples\n", "\n", "\n", " def update_examples(samples):\n", " return gr.Dataset.update(samples=samples)\n", " \n", " def print_samples():\n", " global samples\n", " return {\"samples\": samples}\n", "\n", " def load_example(example_id):\n", " global samples\n", " return samples[example_id][0]\n", "\n", "\n", " with gr.Blocks(theme=gr.themes.Soft(primary_hue='pink',\n", " secondary_hue='pink', neutral_hue='stone')) as app:\n", "\n", " # add a title at the center of the page\n", " gr.Markdown(\"
<h1 style='text-align: center;'>Talk-Back</h1>
\")\n", " \n", " summaries_state = gr.State(value=[], label=\"summaries_state\")\n", "\n", "\n", " with gr.Row():\n", " with gr.Column(scale=.7):\n", "\n", " with gr.Tab(\"Social\"):\n", " pass\n", " # threshold = gr.Slider(minimum=0, maximum=1, default=0.5, label=\"Threshold\")\n", " # threshold.info = \"Degree of similarity with your Query\"\n", " # mybrand = gr.Textbox(label=\"Brand\", default_value=\"\")\n", " # mybrand.info = \"Enter the name of the brand you are interested in\"\n", " # myindustry = gr.Dropdown(choices= Industries, label=\"Industry\")\n", " # myindustry.info = \"Enter the name of the industry you are interested in\"\n", " # brand = gr.State(mybrand.value)\n", " # industry = gr.State(myindustry.value)\n", " # query_type = gr.State('Tweet')\n", " with gr.Tab(\"Medias\"):\n", " pass\n", " # threshold = gr.Slider(minimum=0, maximum=1, default=0.5, label=\"Threshold\")\n", " # # add information about the threshold\n", " # threshold.info = \"Degree of similarity with your Query\"\n", " # mybrand = gr.Textbox(label=\"Brand\", default_value=\"\")\n", " # mybrand.info = \"Enter the name of the brand you are interested in\"\n", " # myindustry = gr.Textbox(label=\"Industry\", default_value=\"\")\n", " # myindustry.info = \"Enter the name of the industry you are interested in\"\n", " # brand = gr.State(mybrand.value)\n", " # industry = gr.State(myindustry.value) \n", " # query_type = gr.State('News Media')\n", " with gr.Tab(\"Reviews\"):\n", " pass\n", " # threshold = gr.Slider(minimum=0, maximum=1, default=0.5, label=\"Threshold\")\n", " # threshold.info = \"Degree of similarity with your Query\"\n", " # mybrand = gr.Textbox(label=\"Brand\", default_value=\"\")\n", " # mybrand.info = \"Enter the name of the brand you are interested in\"\n", " # myindustry = gr.Textbox(label=\"Industry\", default_value=\"\")\n", " # myindustry.info = \"Enter the name of the industry you are interested in\"\n", " # brand = gr.State(mybrand.value)\n", " # industry = gr.State(myindustry.value)\n", " query_type = gr.State('Reviews')\n", " with gr.Tab(\"YouTube\"):\n", " youtube_url = gr.Textbox(label=\"YouTube URL\", default_value=\"https://www.youtube.com/watch?v=9bZkp7q19f0\")\n", " youtube_button = gr.Button(\"Summarize Video\", visible=False)\n", " query_type = gr.State('YouTube')\n", "\n", " threshold = gr.Slider(minimum=0, maximum=1, default=0.5, label=\"Threshold\")\n", " # mybrand = gr.Textbox(label=\"Brand\", default_value=\"\")\n", " # myindustry = gr.Dropdown(choices= Industries, label=\"Industry\")\n", " # brand = gr.State(mybrand.value)\n", " # industry = gr.State(myindustry.value)\n", "\n", " brand = gr.inputs.Textbox(label=\"Brand\")\n", " industry = gr.inputs.Textbox(label=\"Industry\")\n", "\n", "\n", " \n", "\n", " csv_file = gr.File(label=\"File (csv, excel, h5..)\", full_width=False)\n", "\n", " # THE DATASET\n", " data_storage = gr.State(label=\"data_storage\")\n", "\n", " # TOPICS LIST\n", " list_of_topics = gr.State(label=\"list_of_topics\")\n", " \n", " \n", " swot_ = gr.Button(\"SWOT Analysis\")\n", " exec_sum = gr.Button(\"Executive Summary\")\n", " top_clust = gr.Button(\"Topic Cluster\")\n", " trend_analysis = gr.Button(\"Trend Analysis\")\n", " emot_clust = gr.Button(\"Emotional Landscape\")\n", " test_graph = gr.Button(\"Test Graph\")\n", " # convergence_ = gr.Button(\"Convergence Score\")\n", " # competitor_ = gr.Button(\"Competitor Analysis\")\n", "\n", "\n", " # add image output\n", " with gr.Tab(\"Word Cloud\"):\n", " Imaj = gr.Image(label=\"Word Cloud\")\n", " \n", 
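                " # Polarity tab: shows the image returned by get_polarity (VADER compound scores over time\n", "                 # with a rolling-mean trend line); it is populated by the csv_file.change pipeline defined further down in this cell.\n",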
" \n", " \n", " with gr.Tab(\"Polarity\"):\n", " Polarity = gr.Image(label=\"Polarity\")\n", " \n", "\n", " with gr.Column(scale=2):\n", " \n", "\n", " with gr.Tab(\"Text\"):\n", " tab = 'Summary'\n", " data_answer = gr.Textbox(label=\"\", lines=10)\n", " gr.Textbox.style(data_answer, show_copy_button=True)\n", "\n", " with gr.Row():\n", " with gr.Column(scale=6):\n", " # add empty space\n", " pass\n", " with gr.Column(scale=.5, min_width=100):\n", " save_button = gr.Button(\"Save\", visible=True, full_width=False)\n", " save_button.click(save_output(tab))\n", "\n", "\n", " with gr.Tab(\"Table\"):\n", " tab = 'Table'\n", " similar_reviews = gr.Dataframe(label=\"Table\", type=\"pandas\", max_rows=20, overflow_row_behaviour='paginate')\n", " with gr.Row():\n", " with gr.Column(scale=6):\n", " # add empty space\n", " pass\n", " with gr.Column(scale=.5, min_width=100):\n", " save_button = gr.Button(\"Save\", visible=True, full_width=False)\n", " save_button.click(save_output(tab))\n", "\n", " \n", "\n", " with gr.Tab(\"Graph\"):\n", " tab = 'Graph'\n", " graph = gr.Plot(label=\"Graph\")\n", " \n", " with gr.Row():\n", " with gr.Column(scale=6):\n", " # add empty space\n", " pass\n", " with gr.Column(scale=.5, min_width=100):\n", " save_button = gr.Button(\"Save\", visible=True, full_width=False)\n", " gr.Button.style(save_button, color=\"secondary\")\n", " save_button.click(save_output(tab))\n", "\n", "\n", " \n", " \n", "\n", " with gr.Row():\n", " with gr.Column(scale=3):\n", " query = gr.Textbox(label='Query', lines=1, placeholder=\"Ask a question to the dataset...\")\n", " gr.Textbox.style(query, show_copy_button=True)\n", " with gr.Column(scale=.1, min_width=200):\n", "\n", "\n", " audio = gr.Audio(source=\"microphone\", type=\"filepath\", label=\"Audio\")\n", " \n", "\n", " \n", " def get_audio(audio):\n", " \n", " if audio != None:\n", " # time.sleep(1)\n", " audio = whisper.load_audio(audio)\n", " audio = whisper.pad_or_trim(audio)\n", "\n", " # make log-Mel spectrogram and move to the same device as the model\n", " mel = whisper.log_mel_spectrogram(audio).to(model_whisp.device)\n", "\n", " # Decode audio to Text\n", " options = whisper.DecodingOptions(fp16=False)\n", "\n", " result = whisper.decode(model_whisp, mel, options)\n", " audio_output = result.text \n", " return audio_output\n", " else: \n", " return None\n", "\n", "\n", " audio.change(get_audio, audio, query)\n", "\n", "\n", " samples = [[\"What insights can we take from this data?\"], [\"What is the meaning of Life?\"]]\n", "\n", "\n", " def update_examples(query):\n", " global samples\n", " samples = generate_new_examples(query)\n", " return gr.Dataset.update(samples=samples)\n", "\n", " def print_samples():\n", " global samples\n", " return {\"samples\": samples}\n", "\n", " def load_example(example_id):\n", " global samples\n", " return samples[example_id][0]\n", "\n", " submit_button = gr.Button(\"Submit\")\n", " gr.Button.style(submit_button, color=\"secondary\")\n", "\n", " examples = gr.Dataset(samples=samples, components=[query], type=\"index\")\n", " \n", " \n", " csv_file.change(fn=load_csv, inputs=[csv_file], outputs=[data_storage, data_answer, similar_reviews]).then(generate_wordcloud, inputs=[data_storage], outputs=[Imaj]).then(get_polarity, inputs=[data_storage], outputs=[Polarity, data_storage])\n", "\n", "\n", "\n", " # SUMMARIZE VIDEO CONTENT\n", " youtube_button.click(summarize_video, inputs=[youtube_url], outputs=[data_answer])\n", "\n", "\n", " # 1. SUMMARIZE THE DATASET\n", " # 2. 
EXTRACT EXECUTIVE SUMMARY FROM THE DATASET\n", " exec_sum.click(get_executive_summary, inputs=[data_storage, brand, industry, summaries_state], outputs=[data_answer, similar_reviews, summaries_state])\n", "\n", " # 1. CLUSTER TOPICS IN DATASET\n", " # 2. EXTRACT TOPIC SUMMARY FROM THE DATASET\n", " top_clust.click(get_cluster, inputs=[data_storage], outputs=[graph, similar_reviews, list_of_topics, data_storage])\n", " \n", "\n", " trend_analysis.click(get_temporal_evolution_of_topics, inputs=[data_storage], outputs=[graph]).then(get_topic_summary, inputs=[data_storage, list_of_topics, brand, industry, summaries_state], outputs=[data_answer, similar_reviews, summaries_state]) \n", "\n", "\n", " test_graph.click(get_graph2, inputs=[], outputs=[graph])\n", "\n", "\n", " # 1. CLASSIFY EMOTIONS IN DATASET\n", " # 2. EXTRACT EMOTIONAL SUMMARY FROM THE DATASET\n", " #emot_clust.click(emotional_mapping, inputs=[data_storage], outputs=[data_answer, similar_reviews])\n", " emot_clust.click(get_emotion_graph, inputs =[data_storage],outputs=[graph, data_storage, similar_reviews]).then(emotional_mapping, inputs=[data_storage, brand, industry], outputs=[data_answer])\n", "\n", " # 1. ADD QUESTIONS TO THE QUERY\n", " examples.click(load_example, inputs=[examples], outputs=[query]) \n", "\n", "\n", " swot_.click(get_SWOT, inputs=[data_storage, brand, industry], outputs=[data_answer]) \n", " \n", " # 1. GENERATE SYNTHETIC COMMENT\n", " # 2. FIND SIMILAR DATAChat\n", " # 3. ANSWER QUESTION WITH DATA\n", " # 4. UPDATE EXAMPLES\n", " # 5. GENERATE AUDIO OUTPUT # UNCOMMENT FOR TEXT TO SPEECH OUTPUT\n", " submit_button.click(get_synthetic_comment, inputs=[query, threshold, data_storage, ], outputs=[data_answer, similar_reviews, graph]).success(update_examples, inputs=[query], outputs=[examples])#.then(on_click_play_audio_button, inputs=[data_answer])\n", "\n", "\n", " app.launch(share=True, width=1600, height=800)\n", "\n", "if __name__ == \"__main__\":\n", " main()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import re\n", "import numpy as np\n", "import pandas as pd\n", "import umap\n", "import hdbscan\n", "import plotly.express as px\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "import openai\n", "\n", "# Step 1: Setup UMAP\n", "reduce_dim = umap.UMAP(n_components=3, n_neighbors=8, min_dist=0.55)\n", "\n", "# Step 2: Clean the Data\n", "def clean_text_quotes(text):\n", " text = re.sub(r'https?://\\S+', '', text) # Remove hyperlinks\n", " if len(re.findall(r'@[A-Za-z0-9]+', text)) > 5:\n", " text = re.sub(r'@[A-Za-z0-9]+', '', text)\n", " text = re.sub(r'\\s+', ' ', text) # Remove extra whitespace\n", " return text\n", "\n", "df = dataframe.copy()\n", "df = df[~df['translated_text'].isin(['nan', '[deleted]', '[removed]'])]\n", "df['clean_text'] = df['translated_text'].apply(clean_text_quotes)\n", "\n", "# Step 3: Embed the Text\n", "df['embeddings'] = df['translated_text'].apply(lambda x: model.encode(x))\n", "embedding = np.vstack(df.embeddings.values)\n", "\n", "# Step 4: Reduce Dimensionality\n", "umap_embeddings = reduce_dim.fit_transform(embedding)\n", "df[['x', 'y', 'z']] = pd.DataFrame(umap_embeddings, index=df.index)\n", "\n", "# Step 5: Cluster\n", "hdbscan_params = {\n", " 'min_cluster_size': int(len(df) * 0.01 + 10),\n", " 'metric': 'euclidean',\n", " 'cluster_selection_epsilon': 0.01,\n", " 'cluster_selection_method': 
'leaf',\n", " 'algorithm': 'best',\n", " 'prediction_data': False,\n", " 'min_samples': 1\n", "}\n", "cluster = hdbscan.HDBSCAN(**hdbscan_params).fit(df[['x', 'y', 'z']])\n", "\n", "# Step 6: Analyze Clusters\n", "df_cluster = pd.DataFrame(pd.value_counts(cluster.labels_))\n", "clusters = pd.DataFrame(cluster.labels_)\n", "\n", "# Step 7: Feature Extraction\n", "def get_tfidf_top_features(documents, n_top=4):\n", " tfidf_vectorizer = TfidfVectorizer(min_df=0.05, max_df=0.95, max_features=10, stop_words='english')\n", " tfidf = tfidf_vectorizer.fit_transform(documents)\n", " importance = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[::-1]\n", " tfidf_feature_names = np.array(tfidf_vectorizer.get_feature_names_out())\n", " return tfidf_feature_names[importance[:n_top]]\n", "\n", "# Step 8: Assign Names to Clusters\n", "cluster_names = pd.DataFrame(columns=['cluster_name', 'embed_index'])\n", "for i in range(len(df_cluster)):\n", " try:\n", " clstr_nm = get_tfidf_top_features(df['clean_text'][clusters[0] == i])\n", " clstr_idx = df['clean_text'][clusters[0] == i].index\n", " cluster_names = cluster_names.append({'cluster_name': clstr_nm, 'embed_index': clstr_idx}, ignore_index=True)\n", " except:\n", " pass\n", "\n", "# Step 9: Clean Cluster Names\n", "cluster_names['cluster_name'] = cluster_names['cluster_name'].astype(str).str.replace('[\\[\\]\\' ]', '')\n", "clusters_names = cluster_names.explode('embed_index')\n", "\n", "# Step 10: Merge DataFrames\n", "df2 = df.merge(clusters_names, left_index=True, right_on='embed_index')\n", "df2['cluster_name_str'] = df2['cluster_name'].apply(lambda x: ', '.join(map(str, x)))\n", "\n", "# Step 11: Visualize\n", "fig = px.scatter_3d(df2, x='x', y='y', z='z', color='cluster_name_str', hover_data=['clean_text'])\n", "fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))\n", "fig.show()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_graph(dataframe):\n", " \n", " from sklearn.cluster import KMeans\n", " from sklearn.metrics.pairwise import cosine_similarity\n", "\n", " embeddings_array = dataframe['embeddings'].tolist()\n", "\n", " num_clusters = 3 # Adjust the number of clusters as needed\n", " kmeans = KMeans(n_clusters=num_clusters, random_state=42)\n", " cluster_labels = kmeans.fit_predict(embeddings_array)\n", "\n", "\n", " sentences = dataframe['translated_text'].tolist()\n", "\n", " G = nx.DiGraph()\n", "\n", " cos_sim_matrix = cosine_similarity(embeddings_array)\n", "\n", "\n", " for idx, label in enumerate(cluster_labels):\n", " G.add_node(idx, sentence=sentences[idx], cluster=label)\n", "\n", " for i in range(len(sentences)):\n", " for j in range(len(sentences)):\n", " if i != j:\n", " #if cos_sim_matrix[i, j] > 0.8:\n", " G.add_edge(i, j, weight=cos_sim_matrix[i, j])\n", " # else:\n", " # continue\n", "\n", " plt.figure(figsize=(10, 10))\n", "\n", " pos = nx.spring_layout(G, k=0.5, iterations=50)\n", "\n", "\n", " G_undirected = G.to_undirected()\n", "\n", " from community import community_louvain\n", " node_to_community = community_louvain.best_partition(G_undirected)\n", "\n", " community_to_color = {\n", " 0 : 'tab:pink',\n", " 1 : 'tab:orange',\n", " 2 : 'tab:purple',\n", " 3 : 'tab:blue',\n", " }\n", "\n", " node_color = {node: community_to_color[community_id] for node, community_id in node_to_community.items()}\n", "\n", "\n", " reducer = umap.UMAP(n_components=2, random_state=42)\n", " embeddings_2d = reducer.fit_transform(embeddings_array)\n", "\n", " def 
normalize_weight(weight, min_weight, max_weight):\n", " return (weight - min_weight) / (max_weight - min_weight)\n", " \n", "\n", " def visualize_graph_plotly(graph, embeddings_2d, scaling_factor=3):\n", "\n", " min_weight = min((data['weight'] for _, _, data in graph.edges(data=True)))\n", " max_weight = max((data['weight'] for _, _, data in graph.edges(data=True)))\n", "\n", " fig = go.Figure()\n", "\n", " # Add edges with width based on the normalized weights\n", " for i, j in graph.edges():\n", " weight = normalize_weight(graph[i][j]['weight'], min_weight, max_weight)\n", " fig.add_shape(\n", " type=\"line\",\n", " x0=embeddings_2d[i][0],\n", " x1=embeddings_2d[j][0],\n", " y0=embeddings_2d[i][1],\n", " y1=embeddings_2d[j][1],\n", " yref=\"y\",\n", " xref=\"x\",\n", " line=dict(color=\"rgba(211, 211, 211, 0.5)\", width=weight * scaling_factor),\n", " )\n", "\n", " # Add nodes\n", " for idx, emb in enumerate(embeddings_2d):\n", " closeness = nx.closeness_centrality(G)[idx]\n", " degree = nx.degree_centrality(G)[idx]\n", " betweenness = nx.betweenness_centrality(G)[idx]\n", " eigen = nx.eigenvector_centrality(G)[idx]\n", " fig.add_trace(\n", " go.Scatter(\n", " x=[emb[0]],\n", " y=[emb[1]],\n", " mode=\"markers+text\",\n", " text=[graph.nodes[idx][\"sentence\"]],\n", " textposition=\"bottom center\",\n", " marker=dict(color=node_color[idx][4:], size=closeness * 40),\n", " # add closeness, degree, betweenness and sentence as hover text\n", " hovertext=[f\"{graph.nodes[idx]['sentence']}
<br>closeness_centrality: {closeness:.2f}<br>degree_centrality: {degree:.2f}<br>betweenness_centrality: {betweenness:.2f}<br>
eigenvector_centrality: {eigen:.2f}\"],\n", " )\n", " )\n", "\n", "\n", "\n", " fig.update_layout(showlegend=False, plot_bgcolor=\"white\", width=1200, height=800)\n", " fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False,\n", " showline=False, automargin=False, showspikes=False)\n", " fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False,\n", " showline=False, automargin=False, showspikes=False)\n", " \n", " fig.update_layout(title_text=\"Test Graph Visualization\", title_x=0.5, title_font_size=30, title_font_color='black')\n", "\n", "\n", " fig.show()\n", "\n", " # add title \n", "\n", " return fig" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "get_graph(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" }, "vscode": { "interpreter": { "hash": "1e174ff41394b459b17c36a368e189ffbba640cf74d0072dbcc945e7b08140d6" } } }, "nbformat": 4, "nbformat_minor": 2 }