# talkbackstaging / utils /functions.py
# zsoltapp's picture
# Upload 101 files
# a987248
import pandas as pd
import numpy as np
import requests
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model, instantiated once at import time and shared by the
# functions in this module (load_csv calls model.encode when a dataset has no
# precomputed embedding column).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Columns probed, in priority order, when the data has no 'translated_text'
# column of its own.
_TEXT_COLUMN_CANDIDATES = (
    'clean_text_emotions',
    'split_summary_y',
    'split_summary',
    'clean_text',
    'text',
    'content',
    'Comment',
    'summary',
    'body',
)

# Welcome page shown when the project entry does not supply its own intro.
_DEFAULT_INTRO_HTML = """<div class='ddesc'><header>
<h1>Welcome to your Talkback environment!</h1>
<p>Please select one of the predefined analysis on the left</p>
</header>
<main>
<section>
<h2>Executive Summary</h2>
<p>Our platform is a powerful tool that allows users to easily query and analyze their data. With intuitive visualizations and a simple user interface, even non-technical users can quickly gain insights into their data. Whether you're a small business owner or a data scientist, our platform has the features you need to make data-driven decisions.</p>
</section>
<section class='socialonly'>
<h2>Topic cluster</h2>
<p>Our platform now includes a powerful feature for topic clustering, allowing users to identify the main themes and topics within their data. This feature uses advanced algorithms to group similar data points together based on their content, making it easier to identify patterns and trends. With topic clustering, users can quickly gain insights into the main topics driving their data, and use this information to inform their decision-making.</p>
</section>
<section class='socialonly'>
<h2>Sentiment Analysis</h2>
<p>We've also recently added a sentiment analysis feature, which allows users to analyze the sentiment of their data. With this feature, users can understand the emotions and attitudes expressed within their data, and use this information to gain a deeper understanding of their customers or audience. Sentiment analysis can be particularly useful for businesses looking to understand customer feedback or opinions on their products or services. By analyzing the sentiment of customer feedback, businesses can identify areas for improvement and make data-driven decisions to improve customer satisfaction. </p>
</section>
<section class='newsonly'>
<h2>Trend Analysis</h2>
<p>Our platform has been seeing steady growth in user adoption, with an increasing number of businesses turning to data analysis to inform their decision-making. We've also noticed a trend towards more demand for real-time data insights, which we plan to address with upcoming features. Additionally, we've seen a shift towards cloud-based solutions, and we're exploring ways to make our platform more flexible and scalable for our users.</p>
</section>
<section class='newsonly'>
<h2>Competitor Analysis</h2>
<p>We've identified several competitors in the data analysis space, including both established players and newer startups. While some of our competitors offer more advanced features, we believe our focus on ease of use and customer service sets us apart. Our platform is designed to be accessible to users of all skill levels, from beginners to experts. We also differentiate ourselves by offering a range of integrations with popular data sources, such as Google Analytics and Salesforce.</p>
</section>
<section>
<h2>SWOT Analysis</h2>
<p>One of the strengths of our platform is its ease of use, which allows users to quickly get up and running with their data analysis. However, we also recognize that there are areas where we can improve, such as adding more advanced features and expanding our integrations. We see a major opportunity in the growing demand for data analysis tools, especially among small and medium-sized businesses. Our biggest threat comes from larger competitors with more resources, but we believe our focus on usability and customer service will set us apart.</p>
</section>
<section>
<h2>Talk to your data</h2>
<p>In addition to the predefined analysis set you can also use this option that allows you to interact with your data in a more natural and intuitive way.</p>
</section>
</main></div>"""


def _parse_embedding_strings(column):
    """Return *column* with string-serialised vectors parsed into arrays.

    A column that was round-tripped through CSV holds text such as
    ``"[0.1 0.2 ...]"``; those are parsed into float32 arrays. A column that
    is empty, or whose first value is not a string, is returned unchanged.
    """
    # .iloc[0] is positional: a label lookup ([0]) raises when the index does
    # not contain the label 0 (e.g. after rows were dropped).
    if column.empty or not isinstance(column.iloc[0], str):
        return column
    # x[1:-1] strips the surrounding brackets; vstack also verifies that all
    # vectors have the same length.
    stacked = np.vstack([
        np.fromstring(text[1:-1], sep=' ', dtype=np.float32)
        for text in column
    ])
    return list(stacked)


def _save_dataframe(df, path):
    """Write *df* to *path* in the format implied by the file extension."""
    if path.endswith('.csv'):
        df.to_csv(path, index=False)
    elif path.endswith('.xlsx'):
        df.to_excel(path, index=False)
    elif path.endswith('.h5'):
        df.to_hdf(path, key='df', mode='w')


def load_csv(projJson, index, projectData, projectUUID):
    """Load one project dataset and prepare it for analysis.

    Reads the dataset, guarantees a ``translated_text`` column (with
    ``@mentions`` removed) and an ``embeddings`` column, and builds the HTML
    snippets that are returned alongside the data.

    Args:
        projJson: Sequence of project entries; each entry provides
            ``"filename"`` and ``"label"`` and optionally ``"intro"``.
        index: Position of the entry to load (converted with ``int``).
        projectData: Unused; kept so existing callers keep working.
        projectUUID: Name of the project's upload directory under
            ``uploader/uploads/``.

    Returns:
        A 6-tuple ``(df, intro_html, preview_df, formatted_intro_html,
        dataset_name, dataset_label_html)`` where ``preview_df`` holds the
        first 30 rows of ``translated_text``.

    Raises:
        KeyError: If the data has neither ``translated_text`` nor any of the
            known fallback text columns.
    """
    entry = projJson[int(index)]
    filename = entry["filename"]
    outName = "uploader/uploads/" + projectUUID + "/" + filename
    is_remote = "http" in filename
    # Remote datasets are fetched from their URL; uploads live on disk.
    df = readfile(filename if is_remote else outName)

    if 'translated_text' not in df.columns:
        source_column = next(
            (name for name in _TEXT_COLUMN_CANDIDATES if name in df.columns),
            None,
        )
        if source_column is None:
            raise KeyError(
                "no text column found; expected 'translated_text' or one of "
                + ", ".join(_TEXT_COLUMN_CANDIDATES)
            )
        df['translated_text'] = df[source_column]

    # Remove all @xxx mentions from the text.
    df['translated_text'] = df['translated_text'].str.replace(
        r'@\w+', '', regex=True)

    if 'embeddings' in df.columns:
        df['embeddings'] = _parse_embedding_strings(df['embeddings'])
    elif 'encoding' in df.columns:
        df['embeddings'] = _parse_embedding_strings(df['encoding'])
    elif 'paragraph_embeddings' in df.columns:
        df['embeddings'] = _parse_embedding_strings(df['paragraph_embeddings'])
    else:
        # Encode all rows in one call so the model can batch them instead of
        # running one forward pass per row.
        df['embeddings'] = list(model.encode(df['translated_text'].tolist()))
        # Persist the result so the slow model pass is not repeated on the
        # next load. Remote datasets have no local file to write back to.
        # NOTE(review): only freshly encoded data is written back; confirm
        # whether converted 'encoding'/'paragraph_embeddings' columns were
        # meant to be saved as well.
        if not is_remote:
            _save_dataframe(df, outName)

    # A missing or empty "intro" falls back to the default welcome page.
    finished_loading = entry.get("intro") or _DEFAULT_INTRO_HTML
    formattedres = (
        '<p style="color:black;text-align:left;">' + finished_loading + '</p>'
    )
    tdf = df[['translated_text']].head(30)

    if "http" in outName:
        outName = entry["label"] + "API"
    dataset_label = (
        '<span class="hsub">Dataset: </span><span class="selHelper">'
        + entry["label"] + " <i>(n=" + str(len(df)) + ")</i></span>"
    )
    return df, finished_loading, tdf, formattedres, outName, dataset_label
def readfile(filename, timeout=30):
    """Load a dataset into a DataFrame from a file or an HTTP JSON endpoint.

    The source type is chosen by substring match on *filename*, checked in
    this order: ``.xlsx``, ``.csv``, ``.h5``, then ``http`` (JSON API).

    Args:
        filename: Local path or URL of the dataset.
        timeout: Seconds to wait for the HTTP request (API sources only).

    Returns:
        The loaded DataFrame with missing values replaced by ``''``.

    Raises:
        ValueError: If *filename* matches none of the supported sources.
        requests.HTTPError: If the API responds with an error status.
    """
    if ".xlsx" in filename:
        df = pd.read_excel(filename)
    elif ".csv" in filename:
        df = pd.read_csv(filename, delimiter=',', encoding='utf-8-sig')
    elif ".h5" in filename:
        df = pd.read_hdf(filename)
    elif "http" in filename:
        # API request data. A timeout keeps a stalled server from hanging the
        # caller, and error responses are surfaced instead of being parsed.
        response = requests.get(filename, timeout=timeout)
        response.raise_for_status()
        df = pd.DataFrame(response.json())
        # HARDCODED!!! Known survey answer columns: keep only that column,
        # drop empty answers and expose it as 'translated_text'. Once one
        # column matched, the frame no longer contains the other.
        for answer_column in ("QC10", "Q12"):
            if answer_column in df.columns:
                df = (
                    df[[answer_column]]
                    .dropna()
                    .rename(columns={answer_column: 'translated_text'})
                )
    else:
        raise ValueError(
            "unsupported data source (expected .xlsx, .csv, .h5 or an http "
            "URL): " + repr(filename)
        )
    return df.fillna('')