import pandas as pd
import numpy as np
import requests
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model shared by the whole module. It is instantiated once,
# as a side effect of importing this file; load_csv() calls model.encode() on it.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
def load_csv(projJson, index, projectData, projectUUID):
    """Load one project dataset and ensure it has text and embedding columns.

    The dataset description is ``projJson[int(index)]``; this function reads
    its ``"filename"``, ``"intro"`` and ``"label"`` entries. A filename
    containing ``"http"`` is fetched directly through ``readfile``; anything
    else is read from ``uploader/uploads/<projectUUID>/<filename>``.

    After loading, the frame is normalised so that it has:

    * ``translated_text`` - copied from the first known text column that is
      present, with ``@mention`` tokens removed;
    * ``embeddings`` - taken from an existing ``embeddings``, ``encoding`` or
      ``paragraph_embeddings`` column (string-serialised vectors are parsed
      back into float32 arrays), or computed with the module-level ``model``.

    NOTE(review): the text-column fallback order (first present wins) and the
    decision to write the file back only after embeddings were freshly
    computed are reconstructed from flattened source - confirm against the
    intended behaviour.

    Args:
        projJson: Indexable collection of dataset descriptions.
        index: Position of the dataset in ``projJson``; coerced with ``int``.
        projectData: Unused; kept so existing callers keep working.
        projectUUID: Name of the project's upload directory.

    Returns:
        A 6-tuple ``(df, finished_loading, tdf, formattedres, outName, label_html)``:
        the full frame, the intro text (``""`` when the dataset has none), the
        first 30 rows of ``translated_text``, the intro wrapped in a ``<p>``
        tag, the output name (``label + "API"`` for remote sources), and an
        HTML snippet showing the dataset label and row count.

    Raises:
        KeyError: If no known text column is present in the dataset.
    """
    entry = projJson[int(index)]
    filename = entry["filename"]
    outName = "uploader/uploads/" + projectUUID + "/" + filename
    is_remote = "http" in filename

    # Remote datasets are fetched by URL; uploads live in the project folder.
    df = readfile(filename if is_remote else outName)

    _ensure_text_column(df)
    # Strip @mentions from the text.
    df['translated_text'] = df['translated_text'].str.replace(r'@\w+', '', regex=True)

    if 'embeddings' not in df.columns:
        if 'encoding' in df.columns:
            df['embeddings'] = _decode_embeddings(df['encoding'])
        elif 'paragraph_embeddings' in df.columns:
            df['embeddings'] = _decode_embeddings(df['paragraph_embeddings'])
        else:
            # Encode all rows in one batched call instead of one call per row.
            texts = df['translated_text'].tolist()
            df['embeddings'] = list(model.encode(texts)) if texts else []
            # Cache the expensive result next to the upload. Remote sources
            # have no meaningful local path, so they are never written back.
            if not is_remote:
                _persist_frame(df, outName)

    # A pre-existing 'embeddings' column read from CSV/Excel holds strings.
    df['embeddings'] = _decode_embeddings(df['embeddings'])

    # Fall back to an empty intro so the return below never hits an unbound name.
    finished_loading = entry.get("intro") or ""
    formattedres = '<p style="color:black;text-align:left;">' + finished_loading + '</p>'
    tdf = df[['translated_text']].head(30)

    if "http" in outName:
        outName = entry["label"] + "API"

    label_html = (
        '<span class="hsub">Dataset: </span><span class="selHelper">'
        + entry["label"] + " <i>(n=" + str(len(df)) + ")</i></span>"
    )
    return df, finished_loading, tdf, formattedres, outName, label_html


def _ensure_text_column(df):
    """Fill ``translated_text`` in place from the first known text column present."""
    if 'translated_text' in df.columns:
        return
    candidates = (
        'clean_text_emotions',
        'split_summary_y',
        'split_summary',
        'clean_text',
        'text',
        'content',
        'Comment',
        'summary',
        'body',
    )
    for column in candidates:
        if column in df.columns:
            df['translated_text'] = df[column]
            return
    raise KeyError(
        "no text column found; expected 'translated_text' or one of: "
        + ", ".join(candidates)
    )


def _decode_embeddings(series):
    """Return embeddings as float32 arrays, parsing "[v1 v2 ...]" strings if needed."""
    # Only the first value is inspected to decide whether parsing is required.
    if len(series) == 0 or not isinstance(series.iloc[0], str):
        return series
    vectors = np.vstack([
        # x[1:-1] drops the surrounding brackets of the serialised vector.
        np.fromstring(text[1:-1], sep=' ', dtype=np.float32)
        for text in series
    ])
    return list(vectors)


def _persist_frame(df, path):
    """Write the frame back to ``path`` in the format implied by its extension."""
    if path.endswith('.csv'):
        df.to_csv(path, index=False)
    elif path.endswith('.xlsx'):
        df.to_excel(path, index=False)
    elif path.endswith('.h5'):
        df.to_hdf(path, key='df', mode='w')
def readfile(filename, timeout=30):
    """Read a dataset from a local file or an HTTP JSON endpoint into a DataFrame.

    The source type is chosen by substring match on ``filename``, checked in
    this order: ``.xlsx`` (Excel), ``.csv`` (comma-delimited, ``utf-8-sig``),
    ``.h5`` (HDF5), then ``http`` (GET request whose JSON body is turned into
    a frame). Missing values are replaced with empty strings before returning.

    For HTTP responses, if a ``QC10`` column (or otherwise a ``Q12`` column)
    is present, the frame is reduced to that single column, renamed to
    ``translated_text``.
    NOTE(review): the QC10/Q12 mapping is assumed to apply only to API
    responses, not to local files - confirm.

    Args:
        filename: Path or URL of the dataset.
        timeout: Seconds to wait for the HTTP request; only used for URLs.

    Returns:
        The loaded ``pandas.DataFrame`` with NaN values replaced by ``''``.

    Raises:
        ValueError: If ``filename`` matches none of the supported sources.
        requests.HTTPError: If the HTTP endpoint answers with an error status.
    """
    if ".xlsx" in filename:
        df = pd.read_excel(filename)
    elif ".csv" in filename:
        df = pd.read_csv(filename, delimiter=',', encoding='utf-8-sig')
    elif ".h5" in filename:
        df = pd.read_hdf(filename)
    elif "http" in filename:
        # API request data. A timeout keeps a stalled endpoint from hanging
        # the caller forever, and error statuses fail loudly instead of being
        # parsed as if they were data.
        response = requests.get(filename, timeout=timeout)
        response.raise_for_status()
        df = pd.DataFrame(response.json())
        # Survey exports carry the free-text answer under a question code.
        for survey_column in ("QC10", "Q12"):
            if survey_column in df.columns:
                df = df[[survey_column]].rename(
                    columns={survey_column: 'translated_text'})
                break
    else:
        raise ValueError(
            "unsupported data source (expected .xlsx, .csv, .h5 or an http URL): "
            + str(filename)
        )
    return df.fillna('')