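# ScholarSync - Gradio Space for journal/article recommendation over Springer data.
# Pipeline: load the base_springerdata table (or a bundled CSV fallback), aggregate it
# per journal, build TF-IDF indices at the journal and article level, then expose two
# Gradio API endpoints: input validation and article link generation.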
import os
import gradio
import pandas as pd
import psycopg2
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import unicodedata
import json

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
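# Text-preprocessing helpers: get_paragraph flattens a list cell into one lowercase
# string, remove_accents strips diacritics, get_clean_text keeps alphabetic
# non-stop-word tokens, and combine concatenates several columns of a row.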
def get_paragraph(row, index):
    ans = ''
    for x in row[index]:
        ans = ans + ' ' + x.lower()
    return ans

def remove_accents(text):
    text = unicodedata.normalize('NFKD', text).encode(
        'ASCII', 'ignore').decode('utf-8')
    return text

def get_clean_text(row, index):
    if not isinstance(row[index], str):
        return ''
    if row[index] == "NULL":
        return ''
    clean_text = ''
    words = word_tokenize(row[index].lower())
    for word in words:
        word = word.replace(',', ' ')
        word = remove_accents(word)
        if re.match(r'^[a-zA-Z]+$', word) and word not in stop_words and len(word) > 1 and word[1] != '.':
            clean_text += ' ' + word
    return clean_text

def combine(row, indices):
    ans = ''
    for i in indices:
        ans = ans + ' ' + row[i]
    return ans
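# Module-level configuration: English stop words, the source query, and an in-process
# CACHE dict so the expensive data-loading and TF-IDF steps run only once per process.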
stop_words = set(stopwords.words('english'))
query = "SELECT * FROM base_springerdata"

CACHE = {}
SQL_KEY = 'sql'
JOURNAL_COMPLETE = 'journal_complete'
JOURNAL_PARTIAL = 'journal_partial'
VECTORIZER = 'vectorizer'
JOURNAL_TFIDF = 'journal_tfidf'

# Access the secrets
HOST = os.getenv('DATABASE_HOST')
DATABASE = os.getenv('DATABASE_NAME')
USER = os.getenv('DATABASE_USER')
PASSWORD = os.getenv('DATABASE_PASSWORD')
# Load the dataset: query Postgres first, fall back to the bundled compressed CSV
def load_sql_data(query):
    if SQL_KEY in CACHE:
        return CACHE[SQL_KEY]
    try:
        conn = psycopg2.connect(
            host=HOST,
            database=DATABASE,
            user=USER,
            password=PASSWORD
        )
        df = pd.read_sql_query(query, conn)
        df = df.drop(['item_doi'], axis=1)
        # Close the database connection
        conn.close()
        CACHE[SQL_KEY] = df
        return df
    except psycopg2.Error:
        # If the database connection fails, load the data from the compressed CSV file
        df = pd.read_csv('compressed_data.bz2', compression='bz2')
        df = df.drop(['item_doi'], axis=1)
        CACHE[SQL_KEY] = df
        return df

# main_df holds the full article-level dataset
main_df = load_sql_data(query)
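# main_df is expected to expose at least the columns the rest of the pipeline relies on:
# publication_title, item_title, authors, keywords, publication_year, url.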
# Build the journal-level dataframe
def get_journal_df(df):
    if JOURNAL_PARTIAL in CACHE:
        return CACHE[JOURNAL_PARTIAL]
    journal_art = df.groupby('publication_title')['item_title'].apply(
        list).reset_index(name='Articles')
    journal_art.set_index(['publication_title'], inplace=True)
    journal_auth = df.groupby('publication_title')['authors'].apply(
        list).reset_index(name='authors')
    journal_auth.set_index('publication_title', inplace=True)
    journal_key = df.drop_duplicates(
        subset=["publication_title", "keywords"], keep='first')
    journal_key = journal_key.drop(
        ['item_title', 'authors', 'publication_year', 'url'], axis=1)
    journal_key.set_index(['publication_title'], inplace=True)
    journal_main = journal_art.join([journal_key, journal_auth])
    print('journal_main initial')
    journal_main.reset_index(inplace=True)
    journal_main['Articles'] = journal_main.apply(
        get_paragraph, index='Articles', axis=1)
    journal_main['Articles'] = journal_main.apply(
        get_clean_text, index='Articles', axis=1)
    journal_main['authors'] = journal_main.apply(
        get_paragraph, index='authors', axis=1)
    journal_main['authors'] = journal_main.apply(
        get_clean_text, index='authors', axis=1)
    journal_main['keywords'] = journal_main.apply(
        get_clean_text, index='keywords', axis=1)
    journal_main['Tags'] = journal_main.apply(
        combine, indices=['keywords', 'Articles', 'authors'], axis=1)
    journal_main['Tags'] = journal_main.apply(
        get_clean_text, index='Tags', axis=1)
    CACHE[JOURNAL_PARTIAL] = journal_main
    return journal_main

# Journal dataframe
journal_main = get_journal_df(main_df)
print('journal_main processed')
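# At this point journal_main holds one row per journal with cleaned 'Articles',
# 'authors' and 'keywords' text plus a combined 'Tags' column used for TF-IDF matching.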
# Build the journal-level TF-IDF index
def get_tfidfs(journal_main):
    if VECTORIZER in CACHE and JOURNAL_TFIDF in CACHE:
        return CACHE[VECTORIZER], CACHE[JOURNAL_TFIDF]
    vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
    journal_tfidf_matrix = vectorizer.fit_transform(journal_main['Tags'])
    CACHE[VECTORIZER] = vectorizer
    CACHE[JOURNAL_TFIDF] = journal_tfidf_matrix
    return vectorizer, journal_tfidf_matrix

vectorizer, journal_tfidf_matrix = get_tfidfs(journal_main)
print('tfidfs and vectorizer for journals completed')
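# The journal-level TF-IDF matrix is the first recommendation stage: a user query is
# matched against whole journals before drilling down to individual articles.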
def get_article_df(row):
    article = main_df.loc[main_df['publication_title'] ==
                          journal_main['publication_title'][row.name]].copy()
    article['item_title'] = article.apply(
        get_clean_text, index='item_title', axis=1)
    article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
    article['Tokenized'] = article['item_title'].apply(word_tokenize)
    article['Tagged'] = article['Tokenized'].apply(pos_tag)
    # Keep nouns and adjectives that are not stop words
    article['Tags'] = article['Tagged'].apply(lambda x: [word for word, tag in x if
                                              (tag.startswith('NN') or tag.startswith('JJ')) and word.lower() not in stop_words])
    article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
    article['Tags'] = article.apply(
        lambda x: x['Tags'] + ' ' + x['authors'] + ' ' + str(x['publication_year']), axis=1)
    article = article.drop(['keywords', 'publication_title',
                            'Tokenized', 'Tagged', 'authors', 'publication_year'], axis=1)
    article.reset_index(inplace=True)
    article.set_index('index', inplace=True)
    return article

def get_vectorizer(row):
    vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
    return vectorizer

def get_tfidf_matrix(row):
    tfidf_matrix = row['article_vectorizer'].fit_transform(
        row['article_df']['Tags'])
    return tfidf_matrix

def article_preprocessing(df):
    if JOURNAL_COMPLETE in CACHE:
        return CACHE[JOURNAL_COMPLETE]
    df['article_df'] = df.apply(get_article_df, axis=1)
    df['article_vectorizer'] = df.apply(get_vectorizer, axis=1)
    df['article_matrix'] = df.apply(get_tfidf_matrix, axis=1)
    CACHE[JOURNAL_COMPLETE] = df
    return df

journal_main = article_preprocessing(journal_main)
print('done')
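# journal_main now also carries three per-journal columns: 'article_df' (that journal's
# articles), 'article_vectorizer' (a fitted TfidfVectorizer) and 'article_matrix'
# (the article-level TF-IDF matrix) used for the second recommendation stage.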
# Prediction
journal_threshold = 4

def get_journal_index(user_input):
    user_tfidf = vectorizer.transform([user_input])
    cosine_similarities = cosine_similarity(
        user_tfidf, journal_tfidf_matrix).flatten()
    indices = cosine_similarities.argsort()[::-1]
    top_recommendations = [i for i in indices if cosine_similarities[i] > 0][:min(
        journal_threshold, len(indices))]
    return top_recommendations
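# Illustrative (hypothetical) call: get_journal_index("deep learning") would return up
# to journal_threshold row indices of journal_main, ranked by cosine similarity.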
article_threshold = 10

def get_article_recommendations(user_input):
    recommended_journals = get_journal_index(user_input)
    recommendations = []
    for journal_id in recommended_journals:
        user_tfidf = journal_main['article_vectorizer'][journal_id].transform([
            user_input])
        cosine_similarities = cosine_similarity(
            user_tfidf, journal_main['article_matrix'][journal_id]).flatten()
        indices = cosine_similarities.argsort()[::-1]
        top_recommendation_articles = [(cosine_similarities[i], i, journal_id) for i in indices if
                                       cosine_similarities[i] > 0][:min(article_threshold, len(indices))]
        recommendations += top_recommendation_articles
    recommendations.sort(reverse=True)
    return recommendations
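# get_article_recommendations returns (similarity, article_row_index, journal_index)
# tuples sorted by similarity in descending order across the shortlisted journals.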
def validation(text):
    words = word_tokenize(text)
    # Perform part-of-speech tagging
    tagged_words = pos_tag(words)
    # Check if any adjective or noun is present
    adjectives = [word for word, pos in tagged_words if pos.startswith('JJ')]
    nouns = [word for word, pos in tagged_words if pos.startswith('NN')]
    result = {}
    if not adjectives and not nouns:
        result['validation'] = 'invalid'
    else:
        adjective_str = ' '.join(adjectives)
        noun_str = ' '.join(nouns)
        combined_sentence = f"{adjective_str} {noun_str}".strip()
        result['validation'] = 'valid'
        result['sentence'] = combined_sentence
    return result
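# Rough example (actual tags depend on the NLTK tagger): validation("latest AI research")
# should return something like {'validation': 'valid', 'sentence': 'latest AI research'}.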
def get_links(user_input):
    check = validation(user_input)
    if check['validation'] == 'valid':
        recommendations = get_article_recommendations(check['sentence'])
        links = []
        for article in recommendations:
            similarity, article_id, journal_id = article
            link = {
                "title": journal_main['article_df'][journal_id].iloc[article_id, 0],
                "url": journal_main['article_df'][journal_id].iloc[article_id, 1],
                "article_id": int(article_id),
                "journal_id": int(journal_id)
            }
            links.append(link)
        return links
    else:
        return []
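# Note: the link dict assumes 'item_title' and 'url' are the first two columns of each
# per-journal article_df (iloc positions 0 and 1).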
validation_interface = gradio.Interface(
    fn=validation,
    inputs="text",
    outputs=gradio.JSON(),
    title="Validation API - Testing API of ScholarSync",
    description="API to validate user input"
)

links_interface = gradio.Interface(
    fn=get_links,
    inputs="text",
    outputs=gradio.JSON(),
    examples=[
        ["AI"],
        ["Biochemicals"],
        ["Rocket Science"]
    ],
    title="Article Links Generator API - Testing API of ScholarSync",
    description="API to generate article recommendations based on user input"
)

# Combine interfaces into a single app
app = gradio.TabbedInterface([links_interface, validation_interface], ["article link generation", "validation"])

# Run the app
if __name__ == "__main__":
app.launch() | |
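# Hypothetical local smoke test (requires the database/CSV data and NLTK downloads above):
# print(validation("machine learning optimization"))
# print(get_links("machine learning optimization"))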