import gradio as gr
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
from nltk.corpus import stopwords
import nltk

# Ensure the NLTK stopword list is available (no-op if already downloaded)
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))
# Additional domain-specific query words to remove (generic function words
# such as "the", "is", "which" are already covered by the NLTK stopword list)
irrelevant_words = {"what", "paper", "abstract", "papers", "discuss", "find",
                    "about", "who", "one", "two"}
# Load the dataset
file_path = "processed_dataset_v6.csv" # Path to uploaded file
df = pd.read_csv(file_path)

def preprocess_text(text):
    """Preprocess user input to remove stop words, punctuation, and irrelevant words."""
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Remove stop words and irrelevant words
    words = text.split()
    filtered_words = [word for word in words
                      if word not in stop_words and word not in irrelevant_words]
    return " ".join(filtered_words)

def format_doi_url(doi):
    """Format the DOI as a proper AEA web link."""
    return f"https://www.aeaweb.org/articles?id={doi}"

def analyze_keywords(question, threshold=0.15):
    """Match the question against each paper's keywords and return a markdown report."""
    # Check that the required columns exist
    if not all(col in df.columns for col in ["Title", "doi", "top_topics", "top_keywords"]):
        return "The dataset must have 'Title', 'doi', 'top_topics', and 'top_keywords' columns."
    try:
        # Preprocess the question
        processed_question = preprocess_text(question)

        # Combine keywords into a corpus, with the processed question as the last element
        corpus = df["top_keywords"].fillna("").tolist()
        corpus.append(processed_question)

        # Compute TF-IDF embeddings over the papers and the question together
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(corpus)

        # Compute similarity between the question (last row) and all paper keywords
        question_vector = tfidf_matrix[-1]
        similarities = cosine_similarity(tfidf_matrix[:-1], question_vector).flatten()

        # Collect papers above the similarity threshold
        relevant_papers = []
        for idx, score in enumerate(similarities):
            if score >= threshold:
                relevant_papers.append({
                    "Title": df.iloc[idx]["Title"],
                    "DOI": format_doi_url(df.iloc[idx]["doi"]),  # Format DOI as a web link
                    "Top Topics": df.iloc[idx]["top_topics"],
                    "Top Keywords": df.iloc[idx]["top_keywords"],
                    "Score": round(score, 2),  # raw cosine similarity in [0, 1]
                })

        # Sort papers by similarity score (descending order)
        relevant_papers = sorted(relevant_papers, key=lambda x: x["Score"], reverse=True)

        # Format the output
        if not relevant_papers:
            return "No relevant papers found."
        output = "### Relevant Papers\n\n"
        for paper in relevant_papers:
            output += f"**Title**: {paper['Title']}\n\n"
            output += f"**DOI**: [Link]({paper['DOI']})\n\n"
            output += f"**Top Topics**: {paper['Top Topics']}\n\n"
            output += f"**Top Keywords**: {paper['Top Keywords']}\n\n"
            output += f"**Score**: {paper['Score']}\n\n"
            output += "---\n\n"
        return output
    except Exception as e:
        return f"An error occurred: {e}"

# Define the Gradio app
with gr.Blocks() as demo:
    gr.Markdown("# Abstract Analyzer")
    with gr.Row():
        question_input = gr.Textbox(
            label="Ask a question related to research papers",
            placeholder="E.g., What papers discuss innovation strategy?",
        )
        # Optional similarity slider, currently disabled:
        # threshold_input = gr.Slider(label="Similarity Threshold", minimum=0.1, maximum=1.0, value=0.2, step=0.1)
    with gr.Row():
        result_output = gr.Markdown(label="Results")  # Markdown renders the formatted links
    with gr.Row():
        submit_button = gr.Button(value="Submit")
    # Link the submit button to the function
    submit_button.click(analyze_keywords, inputs=[question_input], outputs=result_output)
    # Enter-to-submit variant using the threshold slider, currently disabled:
    # question_input.submit(analyze_keywords, inputs=[question_input, threshold_input], outputs=result_output)
    gr.Markdown("Results provided by a Large Language Model")

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()
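    # Note: on Hugging Face Spaces no launch arguments are needed; running
    # locally, demo.launch(share=True) optionally creates a temporary public URL.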