Spaces:

Sarnika
/

Streamlit_App

Sleeping

Streamlit_App / app.py

Sarnika

final_push

96a4f53 over 2 years ago

5.46 kB

	import streamlit as st
	import gzip
	from pandas import json_normalize
	import pandas as pd
	from relavance_ranking import questions,similarity_with_questions,relevance_function
	from sentiment_analysis import sentiment_analysis
	import nltk
	# Fetch the VADER lexicon through NLTK's downloader when the script starts.
	# NOTE(review): presumably required by the imported sentiment_analysis helper
	# (a 'vader_compound_score' column is read from its output below) — confirm.
	nltk.download('vader_lexicon')

	@st.cache
	def first_part():
	    """Load the review, metadata and Q&A datasets used by the app.

	    Reads ``reviews_file.json.gz`` and ``meta_data.json.gz`` as JSON-lines
	    files, joins them on ``asin`` and collects the product titles that occur
	    in at least 50 of the merged, de-duplicated rows. ``qa_file.json.gz`` is
	    parsed line by line into one flat DataFrame.

	    Returns:
	        dict: ``'products'`` (list of titles, most frequent first), ``'df'``
	        (merged review/metadata frame) and ``'qa_df'`` (one row per Q&A
	        record; an empty DataFrame when the file holds no records).
	    """
	    # Function-scope import: the block needs ast, the file header does not
	    # import it.
	    import ast

	    ReviewFile, MetaDataFile = 'reviews_file.json.gz', 'meta_data.json.gz'
	    reviews_df = pd.read_json(ReviewFile, lines=True, compression='gzip')
	    metadata_df = pd.read_json(MetaDataFile, lines=True, compression='gzip')
	    reviews_df = reviews_df[['asin', 'reviewText', 'summary', 'verified', 'vote']]
	    metadata_df = metadata_df[['asin', 'title']]

	    # Right join: every review row is kept even without a metadata match.
	    df = pd.merge(metadata_df, reviews_df, on='asin', how='right')
	    df = df.drop_duplicates()

	    # value_counts() is already sorted by frequency, so the resulting list
	    # keeps the most-reviewed titles first.
	    title_counts = df['title'].value_counts()
	    products = list(title_counts[title_counts >= 50].index)

	    QAFile = 'qa_file.json.gz'
	    qa_frames = []
	    # The context manager closes the gzip handle even if a line fails to parse.
	    with gzip.open(QAFile, 'rb') as qa:
	        for raw_line in qa:
	            record_text = raw_line.decode('utf-8').strip()
	            if not record_text:
	                continue  # tolerate blank lines instead of failing on them
	            # literal_eval accepts Python literals only, so a crafted line
	            # cannot execute code the way eval() would.
	            qa_frames.append(json_normalize(ast.literal_eval(record_text)))

	    # A single concat replaces the per-line concat, which re-copied the
	    # growing frame on every iteration.
	    qa_df = pd.concat(qa_frames) if qa_frames else pd.DataFrame()
	    return {'products': products, 'df': df, 'qa_df': qa_df}

	# Fetch the cached result once and unpack it, rather than invoking
	# first_part() separately for each key.
	loaded_data = first_part()
	products = loaded_data['products']
	df = loaded_data['df']
	qa_df = loaded_data['qa_df']

	selected_product = st.sidebar.selectbox('Select Product', products)
	st.title("Help-A-Purchase")
	st.header(selected_product)

	# Explicit copy: columns are assigned to selected_df further down, and
	# writing to a filtered frame triggers pandas' SettingWithCopyWarning.
	selected_df = df[df['title'] == selected_product].copy()
	st.metric(label="Reviews", value=len(selected_df))
	import string
	from nltk.tokenize.treebank import TreebankWordDetokenizer
	from nltk.tokenize import RegexpTokenizer


	re_tokenizer = RegexpTokenizer("[\\w']+")
	punc = string.punctuation
	digits = string.digits
	try:
	    stopwords_nltk = set(nltk.corpus.stopwords.words('english'))
	except LookupError:
	    # This script only downloads the VADER lexicon at start-up, so fetch the
	    # stop-word corpus the first time it turns out to be missing.
	    nltk.download('stopwords')
	    stopwords_nltk = set(nltk.corpus.stopwords.words('english'))

	# Character sets for the token filters. Testing `token in punc` against the
	# plain strings is a substring test: it dropped "12" (a substring of
	# "0123456789") but kept "21" or "2020".
	_punct_chars = set(punc)
	_digit_chars = set(digits)
	_detokenizer = TreebankWordDetokenizer()  # built once, reused for every row


	def _clean_review(text):
	    """Return *text* lower-cased with noise tokens removed, as one string.

	    Tokens made up solely of punctuation characters, solely of ASCII digits,
	    or found in the English stop-word set are dropped; the remaining tokens
	    are detokenized back into a single string.
	    """
	    kept = []
	    for token in re_tokenizer.tokenize(text):
	        token = token.lower()
	        if all(ch in _punct_chars for ch in token):
	            continue
	        if all(ch in _digit_chars for ch in token):
	            continue
	        if token in stopwords_nltk:
	            continue
	        kept.append(token)
	    return _detokenizer.detokenize(kept)


	# Work on an independent frame so the column assignments below do not raise
	# pandas' SettingWithCopyWarning on the filtered selection.
	selected_df = selected_df.copy()
	selected_df['reviewText'] = selected_df['reviewText'].astype(str)
	selected_df['reviewText_cleaned'] = selected_df['reviewText'].apply(_clean_review)


	# (The bare first_part() call that used to open this section discarded its
	# result, so it has been removed.)
	dataframe = sentiment_analysis(selected_df)['dataframe']

	# Bucket each review by its compound score: everything starts as 'Negative',
	# scores above 0.33 become 'Neutral' and scores above 0.66 become 'Positive'.
	dataframe = dataframe.assign(flag='Negative')
	dataframe.loc[dataframe['vader_compound_score'] > 0.33, 'flag'] = 'Neutral'
	dataframe.loc[dataframe['vader_compound_score'] > 0.66, 'flag'] = 'Positive'

	# Percentage of reviews per bucket, rounded to one decimal place.
	flag_share = dataframe['flag'].value_counts(normalize=True).mul(100).round(1)

	# Same two-column layout as before ('index' = bucket, 'flag' = percentage),
	# named explicitly so it does not depend on how the installed pandas version
	# labels the output of value_counts().
	count_flag_df = flag_share.rename_axis('index').reset_index(name='flag')

	# Look the values up by label with a 0.0 default: a bucket without any
	# review is simply absent, and float() on an empty selection raises.
	negative = float(flag_share.get('Negative', 0.0))
	neutral = float(flag_share.get('Neutral', 0.0))
	positive = float(flag_share.get('Positive', 0.0))



	@st.cache
	def summary():
	    """Return the positive and negative review summaries for ``selected_df``.

	    Returns:
	        dict: ``'positive_summary'`` and ``'negative_summary'``, taken from
	        the ``'pos_summary'`` and ``'neg_summary'`` entries of the mapping
	        returned by ``sentiment_analysis``.
	    """
	    # Both summaries come from the same result, so run the analysis once
	    # instead of once per key.
	    analysis = sentiment_analysis(selected_df)
	    return {
	        'positive_summary': analysis['pos_summary'],
	        'negative_summary': analysis['neg_summary'],
	    }


	# NOTE: this rebinds the name ``summary`` from the function to its result.
	summary = summary()
	# Two tabs; the second one is filled in further down. The second label
	# previously read "Relavance Ranking" (typo in the user-facing text).
	tab1, tab2 = st.tabs(["Sentiment Analysis & Summarization", "Relevance Ranking"])

	with tab1:
	    # Share of reviews per sentiment bucket, one metric per column.
	    with st.expander("Sentiment Distribution"):
	        col1, col2, col3 = st.columns(3)
	        col1.metric(label="% Positive", value=positive)
	        col2.metric(label="% Neutral", value=neutral)
	        col3.metric(label="% Negative", value=negative)

	    # Summaries are rendered as markdown under their own sub-headers.
	    with st.expander("Summarization"):
	        st.subheader('Top 10 Positive Reviews')
	        st.markdown(summary['positive_summary'])
	        st.subheader('Top 10 Negative Reviews')
	        st.markdown(summary['negative_summary'])


	# Product identifier: the first distinct 'asin' value among the selected rows.
	prod_id = selected_df['asin'].unique()[0]

	# NOTE: from here on ``questions`` names the helper's return value rather
	# than the imported function itself.
	questions = questions(qa_df, selected_df)



	@st.cache
	def output_df():
	    """Run ``similarity_with_questions`` for the current product.

	    Takes no arguments; it reads ``qa_df``, ``selected_df``, ``prod_id`` and
	    ``questions`` from module scope and returns the helper's result as is.
	    """
	    return similarity_with_questions(qa_df, selected_df, prod_id, questions)


	# NOTE: the name ``output_df`` refers to the computed result from here on.
	output_df = output_df()

	@st.cache
	def output_df_open(open):
	    """Run ``similarity_with_questions`` with one extra trailing argument.

	    Args:
	        open: forwarded as the fifth positional argument; the caller in this
	            script passes the question text typed by the user.

	    Returns:
	        Whatever ``similarity_with_questions`` returns for the module-level
	        ``qa_df``, ``selected_df``, ``prod_id`` and ``questions``.
	    """
	    return similarity_with_questions(qa_df, selected_df, prod_id, questions, open)
	with tab2:
	    # Pick one of the product's existing questions and show the three
	    # top rows returned by relevance_function for it.
	    with st.expander("Existing Questions"):
	        if len(questions) == 0:
	            st.write("No open-ended questions for this product!")
	        else:
	            question = st.selectbox('Question', questions)
	            final_df = relevance_function(output_df, question)
	            st.dataframe(final_df.head(3))

	    # NOTE(review): the nesting of this expander under tab2 is reconstructed
	    # from context — confirm against the intended layout.
	    with st.expander('Ask a question!'):
	        # Named user_question rather than `open`, so the builtin open() is
	        # not shadowed at module level.
	        user_question = st.text_input("Enter Question")
	        # Ignore empty and whitespace-only input.
	        if user_question.strip():
	            # The question is passed on with a single leading space.
	            asked_question = ' ' + user_question
	            # output_df_open reads the module-level ``questions``, so the new
	            # question has to be appended before the call.
	            questions = questions + [asked_question]
	            # Keep the result under its own name instead of rebinding the
	            # function name output_df_open.
	            open_results = output_df_open(asked_question)
	            final_df_open = relevance_function(open_results, asked_question)
	            st.dataframe(final_df_open.head(3))