from transformers import pipeline
from transformers import TrainingArguments, Trainer, AutoModelForSeq2SeqLM
# In[2]:
import pandas as pd
import pickle
import streamlit as st
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')  # required by WordNetLemmatizer on some NLTK versions
# In[47]:
data3 = pd.read_csv('final2.csv')
# In[5]:
data3.info()
# In[6]:
data3.head()
# In[9]:
data3['topic'] = data3.topic.astype("string")
data3['discription'] = data3.discription.astype("string")
data3['keyword'] = data3.keyword.astype("string")
data3['level'] = data3.level.astype("string")
data3.info()
# # Data Cleaning Process
# In[10]:
data3['tag'] = data3['discription'] + " " + data3['keyword'] + " " + data3['level']
# In[11]:
def remove_symbols(text):
    # Create a regular expression pattern to match unwanted symbols
    pattern = r'[^\w\s]'  # Matches characters that are not alphanumeric or whitespace
    # Substitute matched symbols with an empty string (input is lowercased first)
    return re.sub(pattern, '', text.lower())
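# Illustrative check (made-up strings, not rows from final2.csv):
# remove_symbols("Python, Basics!")  -> "python basics"
# remove_symbols("OOP & Classes")    -> "oop  classes"   (symbols dropped, whitespace kept)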
# In[12]:
data3['tag'] = data3['tag'].fillna('')
data3['tag'] = data3['tag'].apply(remove_symbols)
data3['level'] = data3['level'].apply(lambda x: x.replace(" ",""))
data3['keyword'] = data3['keyword'].fillna('')
data3.head()
# # Convert tag columns into vector
# In[14]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(data3['tag']).toarray()
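# `vector` is a dense document-term matrix: one row per course in data3 and up to
# 5000 columns (the most frequent non-stop-word terms in `tag`). Illustrative checks:
# vector.shape                       # -> (len(data3), n_terms), with n_terms <= 5000
# cv.get_feature_names_out()[:10]    # first few vocabulary terms (scikit-learn >= 1.0)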
# In[18]:
ps = PorterStemmer()
# In[30]:
def preprocess_query(query):
    # Lowercase the query
    cleaned_query = query.lower()
    # Remove punctuation (adjust as needed)
    import string
    punctuation = string.punctuation
    cleaned_query = ''.join([char for char in cleaned_query if char not in punctuation])
    # Remove stop words (optional, replace with your stop word list)
    stop_words = ["the", "a", "is", "in", "of"]
    cleaned_query = ' '.join([word for word in cleaned_query.split() if word not in stop_words])
    # Stemming
    ps = PorterStemmer()
    cleaned_query = ' '.join([ps.stem(word) for word in cleaned_query.split()])
    # Lemmatization
    wnl = WordNetLemmatizer()
    cleaned_query = ' '.join([wnl.lemmatize(word) for word in cleaned_query.split()])
    return cleaned_query
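# Illustrative example of the whole pipeline (lowercase -> strip punctuation ->
# drop the small stop-word list -> stem -> lemmatize); exact tokens depend on the
# installed NLTK data:
# preprocess_query("Learning the basics of Java programming")
#   -> "learn basic java program"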
# In[31]:
# # Find Similarity score for finding most related topic from dataset
# In[24]:
similar = cosine_similarity(vector)
# In[27]:
# sorted(list(enumerate(similar[1])),reverse = True, key = lambda x: x[1])[0:5]
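# A small helper sketch (not used by the functions below; the name `similar_topics`
# is not from the original code) showing how `similar` can be turned into a list of
# related courses for a given row. Assumes data3 keeps its default RangeIndex.
def similar_topics(row_index, top_n=5):
    scores = sorted(enumerate(similar[row_index]), reverse=True, key=lambda x: x[1])
    # Skip the first entry: every row is most similar to itself (score 1.0).
    return [(data3.loc[i, "topic"], score) for i, score in scores[1:top_n + 1]]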
# In[29]:
summarizer = pipeline("summarization", model="facebook/bart-base")
text_generator = pipeline("text-generation", model="gpt2")
# In[34]:
documents = []
for index, row in data3.iterrows():
    topic_description = preprocess_query(row["topic"])
    keywords = preprocess_query(row["keyword"])
    combined_text = f"{topic_description} {keywords}"  # Combine for TF-IDF
    documents.append(combined_text)
# In[35]:
# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()
# Fit the vectorizer on the documents
document_vectors = vectorizer.fit_transform(documents)
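# document_vectors is a sparse TF-IDF matrix with one row per course; its vocabulary
# comes from the preprocess_query() output, so any incoming query has to go through
# the same preprocess_query() before vectorizer.transform() (as recommend_from_dataset
# does below), otherwise the stemmed vocabulary terms would not match. Illustrative check:
# document_vectors.shape   # -> (len(data3), vocabulary size)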
def recommend_from_dataset(query):
    cleaned_query = preprocess_query(query)
    query_vector = vectorizer.transform([cleaned_query])
    # Calculate cosine similarity between the query and every document
    cosine_similarities = cosine_similarity(query_vector, document_vectors)
    similarity_scores = cosine_similarities.flatten()
    # Sort documents by similarity score (highest first)
    sorted_results = sorted(zip(similarity_scores, data3.index, range(len(documents))), reverse=True)
    # Keep the top N results and return topic names and links (if available),
    # dropping weak matches (score < 0.3)
    top_n_results = sorted_results[:5]
    recommendations = []
    for result in top_n_results:
        score = result[0]
        document_id = result[1]
        topic_name = data3.loc[document_id, "topic"]
        link = data3.loc[document_id, "Links"] if "Links" in data3.columns else "No link available"
        if score >= 0.3:
            recommendations.append({"topic_name": topic_name, "link": link})
    return recommendations
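# Illustrative call (placeholders only; actual topics and links depend on final2.csv):
# recommend_from_dataset("python for beginners")
#   -> [{"topic_name": "...", "link": "..."}, ...]   # up to 5 matches with similarity >= 0.3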
# In[45]:
def summarize_and_generate(user_query, recommendations):
    # Summarize the user query
    query_summary = summarizer(user_query, max_length=200, truncation=True)[0]["summary_text"]
    # Generate creative text related to the query
    generated_text = text_generator(f"Exploring the concept of {user_query}", max_length=200, num_return_sequences=3)[0]["generated_text"]
    # Collect the related topic names and links from the recommendations
    related_links = []
    for recommendation in recommendations:
        related_links.append({"topic": recommendation["topic_name"], "link": recommendation["link"]})
    return {
        "query_summary": query_summary.strip(),
        "generated_text": generated_text.strip(),
        "related_links": related_links
    }
# In[46]:
# user_query = "java "
# recommendations = recommend_from_dataset(user_query)
# # Get the summary, generated text, and related links
# results = summarize_and_generate(user_query, recommendations)
# print(f"Query Summary: {results['query_summary']}")
# print(f"Creative Text: {results['generated_text']}")
# print("Related Links:")
# for link in results["related_links"]:
#     print(f"- {link['topic']}: {link['link']}")
# In[ ]: