# ArticleAPI / app.py
import os
import gradio
import pandas as pd
import psycopg2
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import unicodedata
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
def get_paragraph(row, index):
    # Join the list of strings stored in row[index] into one lowercased string.
    ans = ''
    for x in row[index]:
        ans = ans + ' ' + x.lower()
    return ans
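# Illustrative (hypothetical data):
#   get_paragraph(pd.Series({'Articles': ['Deep Learning', 'Graph Theory']}), 'Articles')
#   -> ' deep learning graph theory'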
def remove_accents(text):
    # NFKD-normalise, then drop any bytes that cannot be encoded as ASCII.
    text = unicodedata.normalize('NFKD', text).encode(
        'ASCII', 'ignore').decode('utf-8')
    return text
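# e.g. remove_accents('Café Ångström') -> 'Cafe Angstrom'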
def get_clean_text(row, index):
    # Lowercase and tokenize row[index], strip accents, and keep only
    # alphabetic tokens longer than one character that are not stopwords.
    # (The letters-only regex already excludes punctuation.)
    if not isinstance(row[index], str):
        return ''
    if row[index] == "NULL":
        return ''
    clean_text = ''
    words = word_tokenize(row[index].lower())
    for word in words:
        word = word.replace(',', ' ')
        word = remove_accents(word)
        if re.match(r'^[a-zA-Z]+$', word) and word not in stop_words and len(word) > 1:
            clean_text += ' ' + word
    return clean_text
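# Illustrative: get_clean_text(pd.Series({'t': 'A Survey of Café AI'}), 't')
# should yield roughly ' survey cafe ai' (exact tokens depend on the NLTK version).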
def combine(row, indices):
    # Concatenate several text columns of a row into a single string.
    ans = ''
    for i in indices:
        ans = ans + ' ' + row[i]
    return ans
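# e.g. combine(row, ['keywords', 'Articles', 'authors']) builds the raw 'Tags' text.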
stop_words = set(stopwords.words('english'))
query = "SELECT * FROM base_springerdata"
CACHE = {}
SQL_KEY = 'sql'
JOURNAL_COMPLETE = 'journal_complete'
JOURNAL_PARTIAL = 'journal_partial'
VECTORIZER = 'vectorizer'
JOURNAL_TFIDF = 'journal_tfidf'
# Access the secrets
HOST = os.getenv('DATABASE_HOST')
DATABASE = os.getenv('DATABASE_NAME')
USER = os.getenv('DATABASE_USER')
PASSWORD = os.getenv('DATABASE_PASSWORD')
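# These come from environment variables (e.g. Space secrets or a local
# `export DATABASE_HOST=...`); if they are unset, the connection attempt
# fails and load_sql_data falls back to the bundled CSV.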
# Load the article data, preferring the database and falling back to CSV.
def load_sql_data(query):
    if SQL_KEY in CACHE:
        return CACHE[SQL_KEY]
    try:
        conn = psycopg2.connect(
            host=HOST,
            database=DATABASE,
            user=USER,
            password=PASSWORD
        )
        df = pd.read_sql_query(query, conn)
        df = df.drop(['item_doi'], axis=1)
        # Close the database connection
        conn.close()
        CACHE[SQL_KEY] = df
        return df
    except psycopg2.Error:
        # If the database is unreachable, load the bundled compressed CSV instead.
        df = pd.read_csv('compressed_data.bz2', compression='bz2')
        df = df.drop(['item_doi'], axis=1)
        CACHE[SQL_KEY] = df
        return df
# main_df
main_df = load_sql_data(query)
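# Columns used downstream: publication_title, item_title, authors, keywords,
# publication_year, url (item_doi is dropped during loading).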
# Build the per-journal dataframe.
def get_journal_df(df):
    if JOURNAL_PARTIAL in CACHE:
        return CACHE[JOURNAL_PARTIAL]
    # Collect each journal's article titles and author lists.
    journal_art = df.groupby('publication_title')['item_title'].apply(
        list).reset_index(name='Articles')
    journal_art.set_index(['publication_title'], inplace=True)
    journal_auth = df.groupby('publication_title')['authors'].apply(
        list).reset_index(name='authors')
    journal_auth.set_index('publication_title', inplace=True)
    journal_key = df.drop_duplicates(
        subset=["publication_title", "keywords"], keep='first')
    journal_key = journal_key.drop(
        ['item_title', 'authors', 'publication_year', 'url'], axis=1)
    journal_key.set_index(['publication_title'], inplace=True)
    journal_main = journal_art.join([journal_key, journal_auth])
    print('journal_main initial')
    journal_main.reset_index(inplace=True)
    # Clean each text column, then fold them into a single 'Tags' column.
    journal_main['Articles'] = journal_main.apply(
        get_paragraph, index='Articles', axis=1)
    journal_main['Articles'] = journal_main.apply(
        get_clean_text, index='Articles', axis=1)
    journal_main['authors'] = journal_main.apply(
        get_paragraph, index='authors', axis=1)
    journal_main['authors'] = journal_main.apply(
        get_clean_text, index='authors', axis=1)
    journal_main['keywords'] = journal_main.apply(
        get_clean_text, index='keywords', axis=1)
    journal_main['Tags'] = journal_main.apply(
        combine, indices=['keywords', 'Articles', 'authors'], axis=1)
    journal_main['Tags'] = journal_main.apply(
        get_clean_text, index='Tags', axis=1)
    CACHE[JOURNAL_PARTIAL] = journal_main
    return journal_main
# Journal Dataframe
journal_main = get_journal_df(main_df)
print('journal_main processed')
# Fit the journal-level TF-IDF.
def get_tfidfs(journal_main):
    if VECTORIZER in CACHE and JOURNAL_TFIDF in CACHE:
        return CACHE[VECTORIZER], CACHE[JOURNAL_TFIDF]
    vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
    journal_tfidf_matrix = vectorizer.fit_transform(journal_main['Tags'])
    CACHE[VECTORIZER] = vectorizer
    CACHE[JOURNAL_TFIDF] = journal_tfidf_matrix
    return vectorizer, journal_tfidf_matrix
vectorizer, journal_tfidf_matrix = get_tfidfs(journal_main)
print('tfidfs and vectorizer for journals completed')
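# A user query transformed with this same vectorizer yields a 1 x |vocab|
# sparse row aligned with journal_tfidf_matrix, so cosine_similarity can
# score the query against every journal's 'Tags' text.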
def get_article_df(row):
    # Build this journal's article table with its own cleaned 'Tags' text.
    article = main_df.loc[main_df['publication_title'] ==
                          journal_main['publication_title'][row.name]].copy()
    article['item_title'] = article.apply(
        get_clean_text, index='item_title', axis=1)
    article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
    article['Tokenized'] = article['item_title'].apply(word_tokenize)
    article['Tagged'] = article['Tokenized'].apply(pos_tag)
    # Keep nouns and adjectives that are not stopwords.
    article['Tags'] = article['Tagged'].apply(lambda x: [word for word, tag in x if
                                              (tag.startswith('NN') or tag.startswith('JJ'))
                                              and word.lower() not in stop_words])
    article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
    article['Tags'] = article.apply(
        lambda x: x['Tags'] + ' ' + x['authors'] + ' ' + str(x['publication_year']), axis=1)
    article = article.drop(['keywords', 'publication_title',
                            'Tokenized', 'Tagged', 'authors', 'publication_year'], axis=1)
    article.reset_index(inplace=True)
    article.set_index('index', inplace=True)
    return article
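# The returned frame keeps the cleaned item_title and url columns plus the
# per-article 'Tags' text that the per-journal TF-IDF below is fitted on.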
def get_vectorizer(row):
    # One independent TF-IDF vectorizer per journal row.
    vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
    return vectorizer
def get_tfidf_matrix(row):
    # Fit the journal's own vectorizer on its articles' 'Tags' text.
    tfidf_matrix = row['article_vectorizer'].fit_transform(
        row['article_df']['Tags'])
    return tfidf_matrix
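# Each journal therefore gets its own vocabulary: the query is re-scored
# inside each recommended journal rather than against one global matrix.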
def article_preprocessing(df):
    if JOURNAL_COMPLETE in CACHE:
        return CACHE[JOURNAL_COMPLETE]
    # Attach a per-journal article table, vectorizer, and TF-IDF matrix.
    df['article_df'] = df.apply(get_article_df, axis=1)
    df['article_vectorizer'] = df.apply(get_vectorizer, axis=1)
    df['article_matrix'] = df.apply(get_tfidf_matrix, axis=1)
    CACHE[JOURNAL_COMPLETE] = df
    return df
journal_main = article_preprocessing(journal_main)
print('done')
# prediction
journal_threshold = 4  # max number of journals to recommend per query
def get_journal_index(user_input):
    # Rank journals by cosine similarity between the query and journal tags.
    user_tfidf = vectorizer.transform([user_input])
    cosine_similarities = cosine_similarity(
        user_tfidf, journal_tfidf_matrix).flatten()
    indices = cosine_similarities.argsort()[::-1]
    top_recommendations = [i for i in indices if cosine_similarities[i] > 0][:min(
        journal_threshold, len(indices))]
    return top_recommendations
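# e.g. get_journal_index('machine learning') -> up to four journal row
# indices, best match first (empty if nothing scores above zero).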
article_threshold = 10  # max number of articles to keep per journal
def get_article_recommendations(user_input):
    # Score the query inside each recommended journal, then merge and sort.
    recommended_journals = get_journal_index(user_input)
    recommendations = []
    for journal_id in recommended_journals:
        user_tfidf = journal_main['article_vectorizer'][journal_id].transform([
            user_input])
        cosine_similarities = cosine_similarity(
            user_tfidf, journal_main['article_matrix'][journal_id]).flatten()
        indices = cosine_similarities.argsort()[::-1]
        top_recommendation_articles = [(cosine_similarities[i], i, journal_id) for i in indices if
                                       cosine_similarities[i] > 0][:min(article_threshold, len(indices))]
        recommendations += top_recommendation_articles
    recommendations.sort(reverse=True)
    return recommendations
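# Returns (similarity, article_position, journal_id) tuples, highest score
# first, e.g. [(0.43, 12, 3), (0.31, 7, 1), ...] (scores illustrative).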
def validation(text):
    words = word_tokenize(text)
    # Perform part-of-speech tagging
    tagged_words = pos_tag(words)
    # Check if any adjective or noun is present
    adjectives = [word for word, pos in tagged_words if pos.startswith('JJ')]
    nouns = [word for word, pos in tagged_words if pos.startswith('NN')]
    result = {}
    if not adjectives and not nouns:
        result['validation'] = 'invalid'
    else:
        # Keep only the adjectives and nouns as the effective search sentence.
        adjective_str = ' '.join(adjectives)
        noun_str = ' '.join(nouns)
        combined_sentence = f"{adjective_str} {noun_str}".strip()
        result['validation'] = 'valid'
        result['sentence'] = combined_sentence
    return result
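# e.g. validation('find fast neural networks') might return
# {'validation': 'valid', 'sentence': 'fast neural networks'}
# (exact tags depend on the NLTK perceptron tagger).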
def get_links(user_input):
    check = validation(user_input)
    if check['validation'] == 'valid':
        recommendations = get_article_recommendations(check['sentence'])
        links = []
        for article in recommendations:
            # (similarity score, article position, journal row); named `score`
            # so it does not shadow the imported cosine_similarity function.
            score, article_id, journal_id = article
            link = {
                "title": journal_main['article_df'][journal_id].iloc[article_id, 0],
                "url": journal_main['article_df'][journal_id].iloc[article_id, 1],
                "article_id": int(article_id),
                "journal_id": int(journal_id)
            }
            links.append(link)
        return links
    else:
        return []
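# e.g. get_links('AI') -> [{'title': ..., 'url': ..., 'article_id': 12,
# 'journal_id': 3}, ...]; returns [] when the input has no nouns or adjectives.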
validation_interface = gradio.Interface(
    fn=validation,
    inputs="text",
    outputs=gradio.outputs.JSON(),
    title="Validation API - Testing API of ScholarSync",
    description="API to validate user input"
)
links_interface = gradio.Interface(
    fn=get_links,
    inputs="text",
    outputs=gradio.outputs.JSON(),
    examples=[
        ["AI"],
        ["Biochemicals"],
        ["Rocket Science"]
    ],
    title="Article Links Generator API - Testing API of ScholarSync",
    description="API to generate article recommendations based on user input"
)
# Combine interfaces into a single app
app = gradio.TabbedInterface([links_interface, validation_interface], ["articles link generation", "validation"])
# Run the app
if __name__ == "__main__":
    app.launch()