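# ScholarSync - Gradio Space for journal/article recommendation over Springer data.
# Pipeline: load the base_springerdata table (or a bundled CSV fallback), aggregate it
# per journal, build TF-IDF indices at the journal and article level, then expose two
# Gradio API endpoints: input validation and article link generation.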
import os
import gradio
import pandas as pd
import psycopg2
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import unicodedata
import json

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
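# Text-preprocessing helpers: get_paragraph flattens a list cell into one lowercase
# string, remove_accents strips diacritics, get_clean_text keeps alphabetic
# non-stop-word tokens, and combine concatenates several columns of a row.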
def get_paragraph(row, index):
    ans = ''
    for x in row[index]:
        ans = ans + ' ' + x.lower()
    return ans

def remove_accents(text):
    text = unicodedata.normalize('NFKD', text).encode(
        'ASCII', 'ignore').decode('utf-8')
    return text

def get_clean_text(row, index):
    if not isinstance(row[index], str):
        return ''
    if row[index] == "NULL":
        return ''
    clean_text = ''
    words = word_tokenize(row[index].lower())
    for word in words:
        word = word.replace(',', ' ')
        word = remove_accents(word)
        if re.match(r'^[a-zA-Z]+$', word) and word not in stop_words and len(word) > 1 and word[1] != '.':
            clean_text += ' ' + word
    return clean_text

def combine(row, indices):
    ans = ''
    for i in indices:
        ans = ans + ' ' + row[i]
    return ans
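# Module-level configuration: English stop words, the source query, and an in-process
# CACHE dict so the expensive data-loading and TF-IDF steps run only once per process.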
stop_words = set(stopwords.words('english'))
query = "SELECT * FROM base_springerdata"

CACHE = {}
SQL_KEY = 'sql'
JOURNAL_COMPLETE = 'journal_complete'
JOURNAL_PARTIAL = 'journal_partial'
VECTORIZER = 'vectorizer'
JOURNAL_TFIDF = 'journal_tfidf'

# Access the secrets
HOST = os.getenv('DATABASE_HOST')
DATABASE = os.getenv('DATABASE_NAME')
USER = os.getenv('DATABASE_USER')
PASSWORD = os.getenv('DATABASE_PASSWORD')
# Load the dataset: query Postgres first, fall back to the bundled compressed CSV
def load_sql_data(query):
    if SQL_KEY in CACHE:
        return CACHE[SQL_KEY]
    try:
        conn = psycopg2.connect(
            host=HOST,
            database=DATABASE,
            user=USER,
            password=PASSWORD
        )
        df = pd.read_sql_query(query, conn)
        df = df.drop(['item_doi'], axis=1)
        # Close the database connection
        conn.close()
        CACHE[SQL_KEY] = df
        return df
    except psycopg2.Error:
        # If the database connection fails, load the data from the compressed CSV file
        df = pd.read_csv('compressed_data.bz2', compression='bz2')
        df = df.drop(['item_doi'], axis=1)
        CACHE[SQL_KEY] = df
        return df

# main_df holds the full article-level dataset
main_df = load_sql_data(query)
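# main_df is expected to expose at least the columns the rest of the pipeline relies on:
# publication_title, item_title, authors, keywords, publication_year, url.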
# Build the journal-level dataframe
def get_journal_df(df):
    if JOURNAL_PARTIAL in CACHE:
        return CACHE[JOURNAL_PARTIAL]
    journal_art = df.groupby('publication_title')['item_title'].apply(
        list).reset_index(name='Articles')
    journal_art.set_index(['publication_title'], inplace=True)
    journal_auth = df.groupby('publication_title')['authors'].apply(
        list).reset_index(name='authors')
    journal_auth.set_index('publication_title', inplace=True)
    journal_key = df.drop_duplicates(
        subset=["publication_title", "keywords"], keep='first')
    journal_key = journal_key.drop(
        ['item_title', 'authors', 'publication_year', 'url'], axis=1)
    journal_key.set_index(['publication_title'], inplace=True)
    journal_main = journal_art.join([journal_key, journal_auth])
    print('journal_main initial')
    journal_main.reset_index(inplace=True)
    journal_main['Articles'] = journal_main.apply(
        get_paragraph, index='Articles', axis=1)
    journal_main['Articles'] = journal_main.apply(
        get_clean_text, index='Articles', axis=1)
    journal_main['authors'] = journal_main.apply(
        get_paragraph, index='authors', axis=1)
    journal_main['authors'] = journal_main.apply(
        get_clean_text, index='authors', axis=1)
    journal_main['keywords'] = journal_main.apply(
        get_clean_text, index='keywords', axis=1)
    journal_main['Tags'] = journal_main.apply(
        combine, indices=['keywords', 'Articles', 'authors'], axis=1)
    journal_main['Tags'] = journal_main.apply(
        get_clean_text, index='Tags', axis=1)
    CACHE[JOURNAL_PARTIAL] = journal_main
    return journal_main

# Journal dataframe
journal_main = get_journal_df(main_df)
print('journal_main processed')
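# At this point journal_main holds one row per journal with cleaned 'Articles',
# 'authors' and 'keywords' text plus a combined 'Tags' column used for TF-IDF matching.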
# Build the journal-level TF-IDF index
def get_tfidfs(journal_main):
    if VECTORIZER in CACHE and JOURNAL_TFIDF in CACHE:
        return CACHE[VECTORIZER], CACHE[JOURNAL_TFIDF]
    vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
    journal_tfidf_matrix = vectorizer.fit_transform(journal_main['Tags'])
    CACHE[VECTORIZER] = vectorizer
    CACHE[JOURNAL_TFIDF] = journal_tfidf_matrix
    return vectorizer, journal_tfidf_matrix

vectorizer, journal_tfidf_matrix = get_tfidfs(journal_main)
print('tfidfs and vectorizer for journals completed')
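# The journal-level TF-IDF matrix is the first recommendation stage: a user query is
# matched against whole journals before drilling down to individual articles.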
def get_article_df(row):
    article = main_df.loc[main_df['publication_title'] ==
                          journal_main['publication_title'][row.name]].copy()
    article['item_title'] = article.apply(
        get_clean_text, index='item_title', axis=1)
    article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
    article['Tokenized'] = article['item_title'].apply(word_tokenize)
    article['Tagged'] = article['Tokenized'].apply(pos_tag)
    # Keep nouns and adjectives that are not stop words
    article['Tags'] = article['Tagged'].apply(lambda x: [word for word, tag in x if
                                              (tag.startswith('NN') or tag.startswith('JJ')) and word.lower() not in stop_words])
    article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
    article['Tags'] = article.apply(
        lambda x: x['Tags'] + ' ' + x['authors'] + ' ' + str(x['publication_year']), axis=1)
    article = article.drop(['keywords', 'publication_title',
                            'Tokenized', 'Tagged', 'authors', 'publication_year'], axis=1)
    article.reset_index(inplace=True)
    article.set_index('index', inplace=True)
    return article

def get_vectorizer(row):
    vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
    return vectorizer

def get_tfidf_matrix(row):
    tfidf_matrix = row['article_vectorizer'].fit_transform(
        row['article_df']['Tags'])
    return tfidf_matrix

def article_preprocessing(df):
    if JOURNAL_COMPLETE in CACHE:
        return CACHE[JOURNAL_COMPLETE]
    df['article_df'] = df.apply(get_article_df, axis=1)
    df['article_vectorizer'] = df.apply(get_vectorizer, axis=1)
    df['article_matrix'] = df.apply(get_tfidf_matrix, axis=1)
    CACHE[JOURNAL_COMPLETE] = df
    return df

journal_main = article_preprocessing(journal_main)
print('done')
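# journal_main now also carries three per-journal columns: 'article_df' (that journal's
# articles), 'article_vectorizer' (a fitted TfidfVectorizer) and 'article_matrix'
# (the article-level TF-IDF matrix) used for the second recommendation stage.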
# Prediction
journal_threshold = 4

def get_journal_index(user_input):
    user_tfidf = vectorizer.transform([user_input])
    cosine_similarities = cosine_similarity(
        user_tfidf, journal_tfidf_matrix).flatten()
    indices = cosine_similarities.argsort()[::-1]
    top_recommendations = [i for i in indices if cosine_similarities[i] > 0][:min(
        journal_threshold, len(indices))]
    return top_recommendations
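# Illustrative (hypothetical) call: get_journal_index("deep learning") would return up
# to journal_threshold row indices of journal_main, ranked by cosine similarity.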
article_threshold = 10

def get_article_recommendations(user_input):
    recommended_journals = get_journal_index(user_input)
    recommendations = []
    for journal_id in recommended_journals:
        user_tfidf = journal_main['article_vectorizer'][journal_id].transform([
            user_input])
        cosine_similarities = cosine_similarity(
            user_tfidf, journal_main['article_matrix'][journal_id]).flatten()
        indices = cosine_similarities.argsort()[::-1]
        top_recommendation_articles = [(cosine_similarities[i], i, journal_id) for i in indices if
                                       cosine_similarities[i] > 0][:min(article_threshold, len(indices))]
        recommendations += top_recommendation_articles
    recommendations.sort(reverse=True)
    return recommendations
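# get_article_recommendations returns (similarity, article_row_index, journal_index)
# tuples sorted by similarity in descending order across the shortlisted journals.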
def validation(text):
    words = word_tokenize(text)
    # Perform part-of-speech tagging
    tagged_words = pos_tag(words)
    # Check if any adjective or noun is present
    adjectives = [word for word, pos in tagged_words if pos.startswith('JJ')]
    nouns = [word for word, pos in tagged_words if pos.startswith('NN')]
    result = {}
    if not adjectives and not nouns:
        result['validation'] = 'invalid'
    else:
        adjective_str = ' '.join(adjectives)
        noun_str = ' '.join(nouns)
        combined_sentence = f"{adjective_str} {noun_str}".strip()
        result['validation'] = 'valid'
        result['sentence'] = combined_sentence
    return result
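# Rough example (actual tags depend on the NLTK tagger): validation("latest AI research")
# should return something like {'validation': 'valid', 'sentence': 'latest AI research'}.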
def get_links(user_input):
    check = validation(user_input)
    if check['validation'] == 'valid':
        recommendations = get_article_recommendations(check['sentence'])
        links = []
        for article in recommendations:
            similarity, article_id, journal_id = article
            link = {
                "title": journal_main['article_df'][journal_id].iloc[article_id, 0],
                "url": journal_main['article_df'][journal_id].iloc[article_id, 1],
                "article_id": int(article_id),
                "journal_id": int(journal_id)
            }
            links.append(link)
        return links
    else:
        return []
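# Note: the link dict assumes 'item_title' and 'url' are the first two columns of each
# per-journal article_df (iloc positions 0 and 1).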
validation_interface = gradio.Interface(
    fn=validation,
    inputs="text",
    outputs=gradio.JSON(),
    title="Validation API - Testing API of ScholarSync",
    description="API to validate user input"
)

links_interface = gradio.Interface(
    fn=get_links,
    inputs="text",
    outputs=gradio.JSON(),
    examples=[
        ["AI"],
        ["Biochemicals"],
        ["Rocket Science"]
    ],
    title="Article Links Generator API - Testing API of ScholarSync",
    description="API to generate article recommendations based on user input"
)

# Combine interfaces into a single app
app = gradio.TabbedInterface([links_interface, validation_interface], ["article link generation", "validation"])

# Run the app
if __name__ == "__main__":
app.launch() | |
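# Hypothetical local smoke test (requires the database/CSV data and NLTK downloads above):
# print(validation("machine learning optimization"))
# print(get_links("machine learning optimization"))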