{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "ecbb6eac", "metadata": {}, "outputs": [], "source": [ "from transformers import pipeline\n", "from transformers import TrainingArguments, Trainer, AutoModelForSeq2SeqLM" ] }, { "cell_type": "code", "execution_count": 2, "id": "a4bac354", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "import nltk\n", "from nltk.stem.porter import PorterStemmer\n", "from nltk.stem import WordNetLemmatizer\n", "import re\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "from fuzzywuzzy import fuzz\n", "from sklearn.feature_extraction.text import TfidfVectorizer" ] }, { "cell_type": "code", "execution_count": 47, "id": "bfe7183c", "metadata": {}, "outputs": [], "source": [ "\n", "data3 = pd.read_csv('final2.csv')" ] }, { "cell_type": "code", "execution_count": 5, "id": "22f9643b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 3720 entries, 0 to 3719\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Unnamed: 0 3720 non-null int64 \n", " 1 topic 3720 non-null object\n", " 2 discription 1748 non-null object\n", " 3 keyword 3204 non-null object\n", " 4 Links 3720 non-null object\n", " 5 level 3720 non-null object\n", "dtypes: int64(1), object(5)\n", "memory usage: 174.5+ KB\n" ] } ], "source": [ "data3.info()" ] }, { "cell_type": "code", "execution_count": 6, "id": "6ef84197", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0topicdiscriptionkeywordLinkslevel
00JavaJava is a general-purpose computer programming...Java, James Gosling, website, wikipedia, docum...website: https://oracle.com/java/, documentati...beginner to advance
11JavaScriptJavaScript (), often abbreviated as JS, is a h...JavaScript, Brendan Eich, reference, wikipedia...reference: https://www.w3schools.com/js/js_res...beginner to advance
22CC (, as in the letter c) is a general-purpose,...C, Dennis Ritchie, reference, wikipedia, docum...reference: http://www.c4learn.com/c-programmin...beginner to advance
33PythonPython is a widely used high-level programming...Python, Guido van Rossum, website, reference, ...website: https://www.python.org/, reference: h...beginner to advance
44SQLSQL ( ( listen) ESS-kew-EL or ( listen) SEE-k...SQL, Donald D. Chamberlin and Raymond F. Boyce...documentation: https://docs.data.world/documen...beginner to advance
\n", "
" ], "text/plain": [ " Unnamed: 0 topic discription \\\n", "0 0 Java Java is a general-purpose computer programming... \n", "1 1 JavaScript JavaScript (), often abbreviated as JS, is a h... \n", "2 2 C C (, as in the letter c) is a general-purpose,... \n", "3 3 Python Python is a widely used high-level programming... \n", "4 4 SQL SQL ( ( listen) ESS-kew-EL or ( listen) SEE-k... \n", "\n", " keyword \\\n", "0 Java, James Gosling, website, wikipedia, docum... \n", "1 JavaScript, Brendan Eich, reference, wikipedia... \n", "2 C, Dennis Ritchie, reference, wikipedia, docum... \n", "3 Python, Guido van Rossum, website, reference, ... \n", "4 SQL, Donald D. Chamberlin and Raymond F. Boyce... \n", "\n", " Links level \n", "0 website: https://oracle.com/java/, documentati... beginner to advance \n", "1 reference: https://www.w3schools.com/js/js_res... beginner to advance \n", "2 reference: http://www.c4learn.com/c-programmin... beginner to advance \n", "3 website: https://www.python.org/, reference: h... beginner to advance \n", "4 documentation: https://docs.data.world/documen... beginner to advance " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data3.head()" ] }, { "cell_type": "code", "execution_count": 9, "id": "acf74e04", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 3720 entries, 0 to 3719\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Unnamed: 0 3720 non-null int64 \n", " 1 topic 3720 non-null string\n", " 2 discription 1748 non-null string\n", " 3 keyword 3720 non-null string\n", " 4 Links 3720 non-null object\n", " 5 level 3720 non-null string\n", "dtypes: int64(1), object(1), string(4)\n", "memory usage: 174.5+ KB\n" ] } ], "source": [ "data3['topic'] = data3.topic.astype(\"string\")\n", "data3['discription'] = data3.discription.astype(\"string\")\n", "data3['keyword'] = data3.keyword.astype(\"string\")\n", "data3['level'] = data3.level.astype(\"string\")\n", "data3.info()" ] }, { "cell_type": "markdown", "id": "64f90df1", "metadata": {}, "source": [ "# Data Cleaning Process\n", "'\n", "'\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "b16989a3", "metadata": {}, "outputs": [], "source": [ "data3['tag'] = data3['discription'] + \" \" + data3['keyword'] +\" \" + data3['level']" ] }, { "cell_type": "code", "execution_count": 11, "id": "caa02729", "metadata": {}, "outputs": [], "source": [ "def remove_symbols(text):\n", " # Create a regular expression pattern to match unwanted symbols\n", " pattern = r'[^\\w\\s]' # Matches characters that are not alphanumeric or whitespace\n", " # Substitute matched symbols with an empty string\n", " return re.sub(pattern, '', text.lower()) " ] }, { "cell_type": "code", "execution_count": 12, "id": "a97fa574", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0topicdiscriptionkeywordLinksleveltag
00JavaJava is a general-purpose computer programming...Java, James Gosling, website, wikipedia, docum...website: https://oracle.com/java/, documentati...beginnertoadvancejava is a generalpurpose computer programming ...
11JavaScriptJavaScript (), often abbreviated as JS, is a h...JavaScript, Brendan Eich, reference, wikipedia...reference: https://www.w3schools.com/js/js_res...beginnertoadvancejavascript often abbreviated as js is a highl...
22CC (, as in the letter c) is a general-purpose,...C, Dennis Ritchie, reference, wikipedia, docum...reference: http://www.c4learn.com/c-programmin...beginnertoadvancec as in the letter c is a generalpurpose impe...
33PythonPython is a widely used high-level programming...Python, Guido van Rossum, website, reference, ...website: https://www.python.org/, reference: h...beginnertoadvancepython is a widely used highlevel programming ...
44SQLSQL ( ( listen) ESS-kew-EL or ( listen) SEE-k...SQL, Donald D. Chamberlin and Raymond F. Boyce...documentation: https://docs.data.world/documen...beginnertoadvancesql listen esskewel or listen seekwəl or ...
\n", "
" ], "text/plain": [ " Unnamed: 0 topic discription \\\n", "0 0 Java Java is a general-purpose computer programming... \n", "1 1 JavaScript JavaScript (), often abbreviated as JS, is a h... \n", "2 2 C C (, as in the letter c) is a general-purpose,... \n", "3 3 Python Python is a widely used high-level programming... \n", "4 4 SQL SQL ( ( listen) ESS-kew-EL or ( listen) SEE-k... \n", "\n", " keyword \\\n", "0 Java, James Gosling, website, wikipedia, docum... \n", "1 JavaScript, Brendan Eich, reference, wikipedia... \n", "2 C, Dennis Ritchie, reference, wikipedia, docum... \n", "3 Python, Guido van Rossum, website, reference, ... \n", "4 SQL, Donald D. Chamberlin and Raymond F. Boyce... \n", "\n", " Links level \\\n", "0 website: https://oracle.com/java/, documentati... beginnertoadvance \n", "1 reference: https://www.w3schools.com/js/js_res... beginnertoadvance \n", "2 reference: http://www.c4learn.com/c-programmin... beginnertoadvance \n", "3 website: https://www.python.org/, reference: h... beginnertoadvance \n", "4 documentation: https://docs.data.world/documen... beginnertoadvance \n", "\n", " tag \n", "0 java is a generalpurpose computer programming ... \n", "1 javascript often abbreviated as js is a highl... \n", "2 c as in the letter c is a generalpurpose impe... \n", "3 python is a widely used highlevel programming ... \n", "4 sql listen esskewel or listen seekwəl or ... " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data3['tag'] = data3['tag'].fillna('')\n", "data3['tag'] = data3['tag'].apply(remove_symbols)\n", "data3['level'] = data3['level'].apply(lambda x: x.replace(\" \",\"\"))\n", "data3['keyword'] = data3['keyword'].fillna('')\n", "data3.head()" ] }, { "cell_type": "code", "execution_count": 13, "id": "a5a4f1ba", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'java is a generalpurpose computer programming language that is concurrent classbased objectoriented and specifically designed to have as few implementation dependencies as possible it is intended to let application developers write once run anywhere wora meaning that compiled java code can run on all platforms that support java without the need for recompilation java applications are typically compiled to bytecode that can run on any java virtual machine jvm regardless of computer architecture as of 2016 java is one of the most popular programming languages in use particularly for clientserver web applications with a reported 9 million developers java was originally developed by james gosling at sun microsystems which has since been acquired by oracle corporation and released in 1995 as a core component of sun microsystems java platform the language derives much of its syntax from c and c but it has fewer lowlevel facilities than either of them the original and reference implementation java compilers virtual machines and class libraries were originally released by sun under proprietary licenses as of may 2007 in compliance with the specifications of the java community process sun relicensed most of its java technologies under the gnu general public license others have also developed alternative implementations of these sun technologies such as the gnu compiler for java bytecode compiler gnu classpath standard libraries and icedteaweb browser plugin for applets the latest version is java 9 released on september 21 2017 and is one of the two versions currently supported for free by oracle versions earlier than java 8 are supported by companies on a commercial basis eg by oracle back to java 6 as of october 2017 while they still highly recommend that you uninstall prejava 8 from at least windows computers java james gosling website wikipedia document united states beginnertoadvance'" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data3['tag'][0]" ] }, { "cell_type": "markdown", "id": "efb5aaba", "metadata": {}, "source": [ "# Convert tag columns into vector " ] }, { "cell_type": "code", "execution_count": 14, "id": "86f2a927", "metadata": {}, "outputs": [], "source": [ "cv = CountVectorizer( max_features = 5000, stop_words = 'english')\n", "vector = cv.fit_transform(data3['tag']).toarray()" ] }, { "cell_type": "code", "execution_count": 15, "id": "b99539f9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 0, 0, ..., 0, 0, 0], dtype=int64)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vector[0]" ] }, { "cell_type": "code", "execution_count": 16, "id": "6be0d7ec", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['10', '100', '1000', ..., 'λprolog', 'λx', 'μc'], dtype=object)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv.get_feature_names_out()" ] }, { "cell_type": "markdown", "id": "019ce68a", "metadata": {}, "source": [ "# Stemming And Lemmitization Process" ] }, { "cell_type": "code", "execution_count": 18, "id": "be45a6b8", "metadata": {}, "outputs": [], "source": [ "ps = PorterStemmer()" ] }, { "cell_type": "code", "execution_count": 30, "id": "3635f58c", "metadata": {}, "outputs": [], "source": [ "def preprocess_query(query):\n", " \n", " # Lowercase the query\n", " cleaned_query = query.lower()\n", "\n", " # Remove punctuation (adjust as needed)\n", " import string\n", " punctuation = string.punctuation\n", " cleaned_query = ''.join([char for char in cleaned_query if char not in punctuation])\n", "\n", " # Remove stop words (optional, replace with your stop word list)\n", " stop_words = [\"the\", \"a\", \"is\", \"in\", \"of\"]\n", " cleaned_query = ' '.join([word for word in cleaned_query.split() if word not in stop_words])\n", "\n", " # Stemming\n", " ps = PorterStemmer()\n", " cleaned_query = ' '.join([ps.stem(word) for word in cleaned_query.split()])\n", "\n", " # Lemmatization\n", " wnl = WordNetLemmatizer()\n", " cleaned_query = ' '.join([wnl.lemmatize(word) for word in cleaned_query.split()])\n", "\n", " return cleaned_query" ] }, { "cell_type": "code", "execution_count": 32, "id": "2787d4d3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'talk'" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocess_query('talked')" ] }, { "cell_type": "code", "execution_count": 31, "id": "6b8326d6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'java jame gosl websit wikipedia document unit state beginnertoadv'" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocess_query('java james gosling website wikipedia document united states beginnertoadvance')" ] }, { "cell_type": "code", "execution_count": 23, "id": "02ff3f52", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 java is a generalpurpos comput program languag...\n", "1 javascript often abbrevi as js is a highlevel ...\n", "2 c as in the letter c is a generalpurpos imper ...\n", "3 python is a wide use highlevel program languag...\n", "4 sql listen esskewel or listen seekwəl or skwee...\n", " ... \n", "3715 understandingtheprofessionaldataengineercertif...\n", "3716 atourofgooglecloudhandsonlab machinelearningen...\n", "3717 introductiontoaiandmachinelearningongoogleclou...\n", "3718 introductiontoaiandmachinelearningongoogleclou...\n", "3719 aifound machinelearningengineerlearningpathweb...\n", "Name: tag, Length: 3720, dtype: object" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data3['tag'].apply(stem) # apply on tag columns " ] }, { "cell_type": "markdown", "id": "66adf3fd", "metadata": {}, "source": [ "# Find Similarity score for finding most related topic from dataset" ] }, { "cell_type": "code", "execution_count": 24, "id": "33126518", "metadata": {}, "outputs": [], "source": [ "similar = cosine_similarity(vector)" ] }, { "cell_type": "code", "execution_count": 27, "id": "e1f7379a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(1, 0.9999999999999998),\n", " (40, 0.4543441112511213),\n", " (350, 0.445852828483904),\n", " (134, 0.4049985302736412),\n", " (1485, 0.3754717312648463)]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sorted(list(enumerate(similar[1])),reverse = True, key = lambda x: x[1])[0:5]" ] }, { "cell_type": "code", "execution_count": 29, "id": "084d898b", "metadata": {}, "outputs": [], "source": [ "summarizer = pipeline(\"summarization\", model=\"facebook/bart-base\")\n", "text_generator = pipeline(\"text-generation\", model=\"gpt2\")" ] }, { "cell_type": "code", "execution_count": 34, "id": "0197db1f", "metadata": {}, "outputs": [], "source": [ "documents = []\n", "for index, row in data3.iterrows():\n", " topic_description = preprocess_query(row[\"topic\"]) \n", " keywords = preprocess_query(row[\"keyword\"]) \n", " combined_text = f\"{topic_description} {keywords}\" # Combine for TF-IDF\n", " documents.append(combined_text)\n" ] }, { "cell_type": "code", "execution_count": 35, "id": "d80d5e6f", "metadata": {}, "outputs": [], "source": [ "# Create TF-IDF vectorizer\n", "vectorizer = TfidfVectorizer()\n", "\n", "# Fit the vectorizer on the documents\n", "document_vectors = vectorizer.fit_transform(documents)\n", "\n", "def recommend_from_dataset(query):\n", " \n", " cleaned_query = preprocess_query(query)\n", " query_vector = vectorizer.transform([cleaned_query])\n", "\n", " # Calculate cosine similarity between query and documents\n", " cosine_similarities = cosine_similarity(query_vector, document_vectors)\n", " similarity_scores = cosine_similarities.flatten()\n", "\n", " # Sort documents based on similarity scores\n", " sorted_results = sorted(zip(similarity_scores, data3.index, range(len(documents))), reverse=True)\n", "\n", " # Return top N recommendations with scores, topic names, and links (if available)\n", " top_n_results = sorted_results[:5] \n", " recommendations = []\n", " for result in top_n_results:\n", " score = result[0]\n", " document_id = result[1]\n", " topic_name = data3.loc[document_id, \"topic\"] \n", " link = data3.loc[document_id, \"Links\"] if \"Links\" in data3.columns else \"No link available\" \n", " if score >= 0.3:\n", " recommendations.append({\"topic_name\": topic_name, \"link\": link, \"score\": score})\n", " return recommendations\n" ] }, { "cell_type": "code", "execution_count": 36, "id": "e56ccfc2", "metadata": {}, "outputs": [], "source": [ "def fine_tune_model(model_name, train_dataset, validation_dataset, epochs=3):\n", " # Load model and tokenizer\n", " model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n", " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "\n", " # Define training arguments (adjust parameters as needed)\n", " training_args = TrainingArguments(\n", " output_dir=\"./results\", # Adjust output directory\n", " per_device_train_batch_size=8,\n", " per_device_eval_batch_size=8,\n", " num_train_epochs=epochs,\n", " save_steps=10_000,\n", " )\n", "\n", " # Create a Trainer instance for fine-tuning\n", " trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=train_dataset,\n", " eval_dataset=validation_dataset,\n", " tokenizer=tokenizer,\n", " )\n", "\n", " # Train the model\n", " trainer.train()\n", "\n", " return model" ] }, { "cell_type": "code", "execution_count": 39, "id": "9c1c02c9", "metadata": {}, "outputs": [], "source": [ "train_dataset = # Prepare your training dataset\n", "validation_dataset = ... # Prepare your validation dataset\n", "\n", "# Fine-tune the model (replace model name if needed)\n", "fine_tuned_model = fine_tune_model(\"facebook/bart-base\", train_dataset, validation_dataset)\n", "\n", "# Update summarization pipeline with the fine-tuned model\n", "summarizer1 = pipeline(\"text-generation\", model=fine_tuned_model, tokenizer=fine_tuned_model.tokenizer)\n" ] }, { "cell_type": "code", "execution_count": 45, "id": "49baeaf5", "metadata": {}, "outputs": [], "source": [ "def summarize_and_generate(user_query, recommendations):\n", " \n", " # Summarize the user query\n", " query_summary = summarizer(user_query, max_length=100, truncation=True)[0][\"summary_text\"]\n", "\n", " # Generate creative text related to the query\n", " generated_text = text_generator(f\"Exploring the concept of {user_query}\", max_length=100, num_return_sequences=1)[0][\"generated_text\"]\n", "\n", " # Extract related links with scores\n", " related_links = []\n", " for recommendation in recommendations:\n", " related_links.append({\"topic\": recommendation[\"topic_name\"], \"link\": recommendation[\"link\"], \"score\": recommendation[\"score\"]})\n", "\n", " return {\n", " \"query_summary\": query_summary.strip(),\n", " \"generated_text\": generated_text.strip(),\n", " \"related_links\": related_links\n", " }" ] }, { "cell_type": "code", "execution_count": 46, "id": "fb9e58cc", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Your max_length is set to 100, but you input_length is only 9. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)\n", "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Query Summary: java by james goslinjames groslin\n", "Creative Text: Exploring the concept of java by james goslin is an impressive effort at the best of times and I'm very impressed by how well this was done. The code looks quite simple for simple purposes — there are only two basic methods, call() and destroy(). These two methods are used by most of the java libraries, so any Java that relies on call() or destroy() should use a proper method of your choice as well. Also, the code uses a single method, so that\n", "Some Related Links for your query:\n", "- Java:\n", " website: https://oracle.com/java/, documentation: https://docs.oracle.com/en/java/, wikipedia: https://en.wikipedia.org/wiki/Java_(programming_language) : \n", " Score: 0.625462748622542\n", "- Java Properties:\n", " wikipedia: https://en.wikipedia.org/wiki/.properties : \n", " Score: 0.3952596829701199\n", "- Java Bytecode:\n", " documentation: https://docs.oracle.com/javase/specs/jvms/se7/html/, wikipedia: https://en.wikipedia.org/wiki/Java_bytecode : \n", " Score: 0.38255306128391625\n", "- Query by Example:\n", " reference: https://semanticscholar.org/paper/f320e453ae65ddf0a3789f4383fa164481c7a8b3, wikipedia: https://en.wikipedia.org/wiki/Query_by_Example : \n", " Score: 0.3726562653850712\n", "- Join Java:\n", " wikipedia: https://en.wikipedia.org/wiki/Join_Java : \n", " Score: 0.3143513411797295\n" ] } ], "source": [ "user_query = \"java by james goslin\"\n", "recommendations = recommend_from_dataset(user_query)\n", "\n", "# Get the summary, generated text, and related links\n", "results = summarize_and_generate(user_query, recommendations)\n", "\n", "print(f\"Query Summary: {results['query_summary']}\")\n", "print(f\"Creative Text: {results['generated_text']}\")\n", "print(\"Some Related Links for your query:\")\n", "for link in results[\"related_links\"]:\n", " print(f\"- {link['topic']}:\\n {link['link']} : \\n Score: {link['score']}\") #(Score: {link['score']})" ] }, { "cell_type": "code", "execution_count": null, "id": "46535752", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 5 }