Spaces:

yonkasoft
/

makaleChatbotu

Build error

File size: 12,933 Bytes

b9ab29b

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'bs4'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[2], line 6\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpymongo\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MongoClient\n\u001b[0;32m      5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mrequests\u001b[39;00m\n\u001b[1;32m----> 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mbs4\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BeautifulSoup\n",
      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'bs4'"
     ]
    }
   ],
   "source": [
    "import csv\n",
    "import pandas as pd \n",
    "from pymongo import MongoClient\n",
    "\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open a client for the MongoDB server on localhost:27017, then select\n",
    "# the `data` collection inside the `myDatabase` database.\n",
    "client = MongoClient(host=\"mongodb://localhost:27017/\")\n",
    "db = client.get_database(\"myDatabase\")\n",
    "source_collection = db.get_collection(\"data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disabled snippet, kept as a string literal so it does not run: it would read every\n",
    "# document from source_collection into a list and print it.\n",
    "# Author's note (translated): I downloaded this file as JSON.\n",
    "\"\"\"yeni_data = list(source_collection.find())\n",
    "print(yeni_data)\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                    _id                             title  \\\n",
      "0  {'$oid': '66a1020f29abc84d21689044'}               Mental Note Vol. 24   \n",
      "1  {'$oid': '66a1020f29abc84d21689045'}         Your Brain On Coronavirus   \n",
      "2  {'$oid': '66a1020f29abc84d21689046'}                    Mind Your Nose   \n",
      "3  {'$oid': '66a1020f29abc84d21689047'}          The 4 Purposes of Dreams   \n",
      "4  {'$oid': '66a1020f29abc84d21689048'}  Surviving a Rod Through the Head   \n",
      "\n",
      "                                                 url                 authors  \\\n",
      "0  https://medium.com/invisible-illness/mental-no...            ['Ryan Fan']   \n",
      "1  https://medium.com/age-of-awareness/how-the-pa...       ['Simon Spichak']   \n",
      "2  https://medium.com/neodotlife/mind-your-nose-f...                      []   \n",
      "3  https://medium.com/science-for-real/the-4-purp...  ['Eshan Samaranayake']   \n",
      "4  https://medium.com/live-your-life-on-purpose/s...        ['Rishav Sinha']   \n",
      "\n",
      "                          timestamp  \\\n",
      "0  2020-12-26 03:38:10.479000+00:00   \n",
      "1  2020-09-23 22:10:17.126000+00:00   \n",
      "2  2020-10-10 20:17:37.132000+00:00   \n",
      "3  2020-12-21 16:05:19.524000+00:00   \n",
      "4  2020-02-26 00:01:01.576000+00:00   \n",
      "\n",
      "                                                tags  \n",
      "0  ['Mental Health', 'Health', 'Psychology', 'Sci...  \n",
      "1  ['Mental Health', 'Coronavirus', 'Science', 'P...  \n",
      "2  ['Biotechnology', 'Neuroscience', 'Brain', 'We...  \n",
      "3  ['Health', 'Neuroscience', 'Mental Health', 'P...  \n",
      "4  ['Brain', 'Health', 'Development', 'Psychology...  \n"
     ]
    }
   ],
   "source": [
    "#csv dosyası olarak yüklenmesi\n",
    "df=pd.read_json('myDatabase.data.json')\n",
    "print(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>_id</th>\n",
       "      <th>title</th>\n",
       "      <th>url</th>\n",
       "      <th>authors</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>tags</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>{'$oid': '66a1020f29abc84d21689044'}</td>\n",
       "      <td>Mental Note Vol. 24</td>\n",
       "      <td>https://medium.com/invisible-illness/mental-no...</td>\n",
       "      <td>['Ryan Fan']</td>\n",
       "      <td>2020-12-26 03:38:10.479000+00:00</td>\n",
       "      <td>['Mental Health', 'Health', 'Psychology', 'Sci...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>{'$oid': '66a1020f29abc84d21689045'}</td>\n",
       "      <td>Your Brain On Coronavirus</td>\n",
       "      <td>https://medium.com/age-of-awareness/how-the-pa...</td>\n",
       "      <td>['Simon Spichak']</td>\n",
       "      <td>2020-09-23 22:10:17.126000+00:00</td>\n",
       "      <td>['Mental Health', 'Coronavirus', 'Science', 'P...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>{'$oid': '66a1020f29abc84d21689046'}</td>\n",
       "      <td>Mind Your Nose</td>\n",
       "      <td>https://medium.com/neodotlife/mind-your-nose-f...</td>\n",
       "      <td>[]</td>\n",
       "      <td>2020-10-10 20:17:37.132000+00:00</td>\n",
       "      <td>['Biotechnology', 'Neuroscience', 'Brain', 'We...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>{'$oid': '66a1020f29abc84d21689047'}</td>\n",
       "      <td>The 4 Purposes of Dreams</td>\n",
       "      <td>https://medium.com/science-for-real/the-4-purp...</td>\n",
       "      <td>['Eshan Samaranayake']</td>\n",
       "      <td>2020-12-21 16:05:19.524000+00:00</td>\n",
       "      <td>['Health', 'Neuroscience', 'Mental Health', 'P...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>{'$oid': '66a1020f29abc84d21689048'}</td>\n",
       "      <td>Surviving a Rod Through the Head</td>\n",
       "      <td>https://medium.com/live-your-life-on-purpose/s...</td>\n",
       "      <td>['Rishav Sinha']</td>\n",
       "      <td>2020-02-26 00:01:01.576000+00:00</td>\n",
       "      <td>['Brain', 'Health', 'Development', 'Psychology...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                    _id                             title  \\\n",
       "0  {'$oid': '66a1020f29abc84d21689044'}               Mental Note Vol. 24   \n",
       "1  {'$oid': '66a1020f29abc84d21689045'}         Your Brain On Coronavirus   \n",
       "2  {'$oid': '66a1020f29abc84d21689046'}                    Mind Your Nose   \n",
       "3  {'$oid': '66a1020f29abc84d21689047'}          The 4 Purposes of Dreams   \n",
       "4  {'$oid': '66a1020f29abc84d21689048'}  Surviving a Rod Through the Head   \n",
       "\n",
       "                                                 url                 authors  \\\n",
       "0  https://medium.com/invisible-illness/mental-no...            ['Ryan Fan']   \n",
       "1  https://medium.com/age-of-awareness/how-the-pa...       ['Simon Spichak']   \n",
       "2  https://medium.com/neodotlife/mind-your-nose-f...                      []   \n",
       "3  https://medium.com/science-for-real/the-4-purp...  ['Eshan Samaranayake']   \n",
       "4  https://medium.com/live-your-life-on-purpose/s...        ['Rishav Sinha']   \n",
       "\n",
       "                          timestamp  \\\n",
       "0  2020-12-26 03:38:10.479000+00:00   \n",
       "1  2020-09-23 22:10:17.126000+00:00   \n",
       "2  2020-10-10 20:17:37.132000+00:00   \n",
       "3  2020-12-21 16:05:19.524000+00:00   \n",
       "4  2020-02-26 00:01:01.576000+00:00   \n",
       "\n",
       "                                                tags  \n",
       "0  ['Mental Health', 'Health', 'Psychology', 'Sci...  \n",
       "1  ['Mental Health', 'Coronavirus', 'Science', 'P...  \n",
       "2  ['Biotechnology', 'Neuroscience', 'Brain', 'We...  \n",
       "3  ['Health', 'Neuroscience', 'Mental Health', 'P...  \n",
       "4  ['Brain', 'Health', 'Development', 'Psychology...  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rich display of the first five rows of df.\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "_id          object\n",
       "title        object\n",
       "url          object\n",
       "authors      object\n",
       "timestamp    object\n",
       "tags         object\n",
       "dtype: object"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the dtype pandas assigned to each column of df.\n",
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    <class 'dict'>\n",
      "Name: _id, dtype: object\n",
      "0    <class 'str'>\n",
      "Name: title, dtype: object\n",
      "0    <class 'str'>\n",
      "Name: url, dtype: object\n",
      "0    <class 'str'>\n",
      "Name: authors, dtype: object\n",
      "0    <class 'str'>\n",
      "Name: timestamp, dtype: object\n",
      "0    <class 'str'>\n",
      "Name: tags, dtype: object\n"
     ]
    }
   ],
   "source": [
    "for i in df.columns:\n",
    "    print(df[i].apply(lambda x:type(x)).head(1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the page behind every URL in df and keep the text of its <article> element.\n",
    "\n",
    "# Upper bound in seconds for each HTTP request; without it requests.get can block indefinitely.\n",
    "REQUEST_TIMEOUT_SECONDS = 10\n",
    "\n",
    "# One entry per row of df['url'], in the same order.\n",
    "contents = []\n",
    "\n",
    "for url in df['url']:\n",
    "    try:\n",
    "        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)\n",
    "        # Raise on HTTP error statuses so an error page is not stored as article text.\n",
    "        response.raise_for_status()\n",
    "        soup = BeautifulSoup(response.content, 'html.parser')\n",
    "\n",
    "        # The HTML element is <article>; 'articles' is not a tag name and would never match.\n",
    "        article_content = soup.find('article')\n",
    "        # Join text nodes with a space so words from adjacent tags are not fused together.\n",
    "        content = article_content.get_text(separator=' ', strip=True) if article_content else 'content not found'\n",
    "    except Exception as e:\n",
    "        # Best effort: record the failure in place of the text so contents stays aligned with df.\n",
    "        content = f'error retrieving content: {e}'\n",
    "    contents.append(content)\n",
    "\n",
    "# Add the collected text to the frame as a new column.\n",
    "df['content'] = contents\n",
    "\n",
    "# Inspect the updated frame.\n",
    "print(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Split the data into training and validation sets for model training.\n",
    "# NOTE(review): translated_data is not defined in any cell visible in this notebook - confirm where it comes from.\n",
    "# NOTE(review): the same object is passed as both the features and the labels - confirm this is intended.\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Hold out 20% for validation; random_state=42 makes the split repeatable.\n",
    "X_train, X_val, y_train, y_val = train_test_split(translated_data, translated_data, test_size=0.2, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.svm import SVC\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "# Learn the TF-IDF vocabulary on the training split, then encode the validation split with it.\n",
    "vectorizer = TfidfVectorizer()\n",
    "X_train_transformed = vectorizer.fit_transform(raw_documents=X_train)\n",
    "X_val_transformed = vectorizer.transform(raw_documents=X_val)\n",
    "\n",
    "# Train a support vector classifier with default settings; the fitted estimator is the cell's output.\n",
    "model = SVC()\n",
    "model.fit(X=X_train_transformed, y=y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "# Predict on the validation split and compare against its labels.\n",
    "y_pred = model.predict(X=X_val_transformed)\n",
    "accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)\n",
    "\n",
    "# Report the accuracy with two decimals.\n",
    "print(f\"Accuracy: {accuracy:.2f}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "myenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}