Spaces:

yonkasoft
/

makaleChatbotu

Build error

App Files Files Community

yonkasoft committed on Aug 27, 2024

Commit

4c37d0f

verified ·

1 Parent(s): 4a9831d

Upload 3 files

Browse files

Files changed (4) hide show

.gitattributes +2 -0
cleaned_data3.csv +3 -0
cleaned_processed_data_sample.csv +0 -0
get_text.ipynb +385 -0

.gitattributes CHANGED Viewed

@@ -37,3 +37,5 @@ combined_output.csv filter=lfs diff=lfs merge=lfs -text
 combined_texts.csv filter=lfs diff=lfs merge=lfs -text
 processed_data.csv filter=lfs diff=lfs merge=lfs -text
 cleaned_processed_data.csv filter=lfs diff=lfs merge=lfs -text

 combined_texts.csv filter=lfs diff=lfs merge=lfs -text
 processed_data.csv filter=lfs diff=lfs merge=lfs -text
 cleaned_processed_data.csv filter=lfs diff=lfs merge=lfs -text
+cleaned_data3.csv filter=lfs diff=lfs merge=lfs -text
+cleaned_processed_data_sample.csv filter=lfs diff=lfs merge=lfs -text

cleaned_data3.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce6e95de4b9027b056fcd106de49fa4bcdcf6492c3a77d70970c7a049fdc0088
+size 367048270

cleaned_processed_data_sample.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

get_text.ipynb ADDED Viewed

	@@ -0,0 +1,385 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "import pandas as pd \n",
+    "from pymongo import MongoClient\n",
+    "from transformers import BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder;\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "import numpy as np\n",
+    "import re\n",
+    "import pandas as pd\n",
+    "from nltk.stem import WordNetLemmatizer\n",
+    "from nltk.corpus import stopwords as nltk_stopwords\n",
+    "from transformers import BertTokenizer, BertModel, AutoTokenizer\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "import torch\n",
+    "from pymongo import MongoClient\n",
+    "import torch.nn.functional as F"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Kısaltılmış metinler:\n",
+      "0    Alman tarihçileri Alman sosyologlar Alman devr...\n",
+      "1    Diskografi : Seferberlik Türküleri Kuvayi Mill...\n",
+      "2    Modern bilgisayarlar Ayrıca Bilgisayarlar Bilg...\n",
+      "3                             Ayrıca Kaynakça Edebiyat\n",
+      "4    Ayrıca Mühendislik Mühendislik dalları Mühendi...\n",
+      "Name: kısaltılmıs_metin, dtype: object\n",
+      "Tokenize edilmiş ve padding uygulanmış veriler:\n",
+      "                                   kısaltılmıs_metin  \\\n",
+      "0  Alman tarihçileri Alman sosyologlar Alman devr...   \n",
+      "1  Diskografi : Seferberlik Türküleri Kuvayi Mill...   \n",
+      "2  Modern bilgisayarlar Ayrıca Bilgisayarlar Bilg...   \n",
+      "3                           Ayrıca Kaynakça Edebiyat   \n",
+      "4  Ayrıca Mühendislik Mühendislik dalları Mühendi...   \n",
+      "\n",
+      "                                       padded_tokens  \n",
+      "0  [2, 3651, 2465, 10576, 3651, 23906, 7131, 1980...  \n",
+      "1  [2, 28488, 12922, 30, 17749, 3251, 2102, 22548...  \n",
+      "2  [2, 11368, 26726, 3401, 7682, 1980, 7682, 7682...  \n",
+      "3  [2, 3401, 7934, 2548, 8558, 3, 0, 0, 0, 0, 0, ...  \n",
+      "4  [2, 3401, 13858, 13858, 31737, 13858, 13858, 2...  \n",
+      "Temizlenmiş veri 'cleaned_data3.csv' dosyasına kaydedildi.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'cleaned_data3.csv'"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import re\n",
+    "from nltk.corpus import stopwords\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "#------------------------cümlelerin boyutlarını ve stop wordsleri tanımladığımız yer -----------------------------\n",
+    "import spacy\n",
+    "from spacy.lang.tr import Turkish\n",
+    "\n",
+    "nlp = Turkish()\n",
+    "\n",
+    "def truncate_text_meaningful(text, max_len=300):\n",
+    "    \"\"\"Return a shortened version of `text` built from its first `max_len` kept tokens.\n",
+    "\n",
+    "    The text is run through the module-level `nlp` pipeline; stop words and\n",
+    "    punctuation are skipped, and the `lemma_` of every remaining token is kept.\n",
+    "    The first `max_len` kept items are joined with single spaces.\n",
+    "\n",
+    "    NOTE(review): `nlp` is a blank `Turkish()` pipeline - confirm that\n",
+    "    `token.lemma_` yields the expected values with the installed spaCy version.\n",
+    "    \"\"\"\n",
+    "    kept_lemmas = []\n",
+    "    for token in nlp(text):\n",
+    "        # Skip stop words and punctuation marks\n",
+    "        if token.is_stop or token.is_punct:\n",
+    "            continue\n",
+    "        kept_lemmas.append(token.lemma_)\n",
+    "\n",
+    "    # Keep only the first `max_len` items\n",
+    "    return ' '.join(kept_lemmas[:max_len])\n",
+    "        \n",
+    "\n",
+    "    \n",
+    "\n",
+    "  \n",
+    "\n",
+    "#----------------------------------tokenize etme fonksiyonu-----------------------------------\n",
+    "def tokenize_and_pad(data, model_name='bert-base-uncased', max_length=512):\n",
+    "    \"\"\"Tokenize `data` with the pretrained tokenizer named `model_name`.\n",
+    "\n",
+    "    The tokenizer is called with `padding=True`, `truncation=True` and the given\n",
+    "    `max_length`; the encoding object it returns is passed back unchanged.\n",
+    "    \"\"\"\n",
+    "    pretrained_tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+    "    return pretrained_tokenizer(data, padding=True, truncation=True, max_length=max_length)\n",
+    "\n",
+    "class DataProcessor:\n",
+    "    \"\"\"Clean, shorten and tokenize the texts of a CSV file, then save the result.\n",
+    "\n",
+    "    The input CSV must contain a 'metinler' column. The saved CSV holds the input\n",
+    "    columns plus 'kısaltılmıs_metin' (filtered and shortened text) and\n",
+    "    'padded_tokens' (the tokenizer's input ids).\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, input_csv, output_csv, max_words=300, model_name='dbmdz/distilbert-base-turkish-cased'):\n",
+    "        \"\"\"Store the pipeline settings.\n",
+    "\n",
+    "        Args:\n",
+    "            input_csv: Path of the CSV file to read.\n",
+    "            output_csv: Path of the CSV file to write.\n",
+    "            max_words: Maximum number of tokens kept per text when shortening.\n",
+    "            model_name: Tokenizer name forwarded to `tokenize_and_pad`.\n",
+    "        \"\"\"\n",
+    "        self.input_csv = input_csv\n",
+    "        self.output_csv = output_csv\n",
+    "        self.max_words = max_words\n",
+    "        self.model_name = model_name\n",
+    "\n",
+    "    @staticmethod\n",
+    "    def _filter_text(text):\n",
+    "        \"\"\"Remove links, dates and numbers, then drop words shorter than 2 or longer than 20 characters.\"\"\"\n",
+    "        # Remove external links and URL-like tokens\n",
+    "        text = re.sub(r'http\\S+|https\\S+|\\b(?:www\\.)?\\S+\\.\\w{2,4}\\b', '', text)\n",
+    "        # Remove dates and numbers; the last alternative already strips every digit run,\n",
+    "        # so no second digit-removal pass is needed\n",
+    "        text = re.sub(r'\\d{4}-\\d{2}-\\d{2}|\\d{2}/\\d{2}/\\d{4}|\\d+', '', text)\n",
+    "        # Drop words that are too short or too long\n",
+    "        return ' '.join(word for word in text.split() if 2 <= len(word) <= 20)\n",
+    "\n",
+    "    def main_pipeline(self):\n",
+    "        \"\"\"Run filter -> shorten -> tokenize -> save and return the output CSV path.\"\"\"\n",
+    "        df = pd.read_csv(self.input_csv)\n",
+    "\n",
+    "        # Missing texts are read as NaN (a float); treat them as empty strings so the\n",
+    "        # string functions below do not fail on them\n",
+    "        raw_texts = df['metinler'].fillna('').astype(str)\n",
+    "        filtered_texts = raw_texts.apply(self._filter_text)\n",
+    "\n",
+    "        # Shorten the FILTERED texts. The raw 'metinler' column used to be shortened here,\n",
+    "        # which overwrote the filtered column and silently discarded the cleaning step.\n",
+    "        df['kısaltılmıs_metin'] = filtered_texts.apply(lambda x: truncate_text_meaningful(x, max_len=self.max_words))\n",
+    "        padded_tokens = tokenize_and_pad(df['kısaltılmıs_metin'].tolist(), model_name=self.model_name)\n",
+    "        df['padded_tokens'] = padded_tokens['input_ids']\n",
+    "        print(\"Kısaltılmış metinler:\")\n",
+    "        print(df['kısaltılmıs_metin'].head())\n",
+    "        print(\"Tokenize edilmiş ve padding uygulanmış veriler:\")\n",
+    "        print(df[['kısaltılmıs_metin', 'padded_tokens']].head())\n",
+    "\n",
+    "        # Save the data\n",
+    "        self.save_cleaned_data(df)\n",
+    "\n",
+    "        return self.output_csv\n",
+    "\n",
+    "    def save_cleaned_data(self, df):\n",
+    "        \"\"\"Write `df` to `self.output_csv` without the index and print a confirmation.\"\"\"\n",
+    "        df.to_csv(self.output_csv, index=False)\n",
+    "        print(f\"Temizlenmiş veri '{self.output_csv}' dosyasına kaydedildi.\")\n",
+    "\n",
+    "# --------------------------------- Run the pipeline and save the data -------------------------------------\n",
+    "\n",
+    "processor = DataProcessor(\n",
+    "    input_csv=\"texts_egitim.csv\",\n",
+    "    output_csv=\"cleaned_data3.csv\",\n",
+    ")\n",
+    "processor.main_pipeline()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                                        kısaltılmıs_metin\n",
+      "0       Alman tarihçileri Alman sosyologlar Alman devr...\n",
+      "1       Diskografi : Seferberlik Türküleri Kuvayi Mill...\n",
+      "2       Modern bilgisayarlar Ayrıca Bilgisayarlar Bilg...\n",
+      "3                                Ayrıca Kaynakça Edebiyat\n",
+      "4       Ayrıca Mühendislik Mühendislik dalları Mühendi...\n",
+      "...                                                   ...\n",
+      "104103               Dış bağlantılar Kaynakça Cicerininae\n",
+      "104104               Dış bağlantılar Kaynakça Cicerininae\n",
+      "104105                                   Lig futbolcuları\n",
+      "104106           Dış bağlantılar Kaynakça Kalyptorhynchia\n",
+      "104107  Dış bağlantılar 'de Japonya'da oluşumlar 'de b...\n",
+      "\n",
+      "[104108 rows x 1 columns]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>kısaltılmıs_metin</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Alman tarihçileri Alman sosyologlar Alman devr...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Diskografi : Seferberlik Türküleri Kuvayi Mill...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Modern bilgisayarlar Ayrıca Bilgisayarlar Bilg...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Ayrıca Kaynakça Edebiyat</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Ayrıca Mühendislik Mühendislik dalları Mühendi...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>104103</th>\n",
+       "      <td>Dış bağlantılar Kaynakça Cicerininae</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>104104</th>\n",
+       "      <td>Dış bağlantılar Kaynakça Cicerininae</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>104105</th>\n",
+       "      <td>Lig futbolcuları</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>104106</th>\n",
+       "      <td>Dış bağlantılar Kaynakça Kalyptorhynchia</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>104107</th>\n",
+       "      <td>Dış bağlantılar 'de Japonya'da oluşumlar 'de b...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>104108 rows × 1 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                        kısaltılmıs_metin\n",
+       "0       Alman tarihçileri Alman sosyologlar Alman devr...\n",
+       "1       Diskografi : Seferberlik Türküleri Kuvayi Mill...\n",
+       "2       Modern bilgisayarlar Ayrıca Bilgisayarlar Bilg...\n",
+       "3                                Ayrıca Kaynakça Edebiyat\n",
+       "4       Ayrıca Mühendislik Mühendislik dalları Mühendi...\n",
+       "...                                                   ...\n",
+       "104103               Dış bağlantılar Kaynakça Cicerininae\n",
+       "104104               Dış bağlantılar Kaynakça Cicerininae\n",
+       "104105                                   Lig futbolcuları\n",
+       "104106           Dış bağlantılar Kaynakça Kalyptorhynchia\n",
+       "104107  Dış bağlantılar 'de Japonya'da oluşumlar 'de b...\n",
+       "\n",
+       "[104108 rows x 1 columns]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from IPython.display import display\n",
+    "\n",
+    "# Read the CSV file\n",
+    "df = pd.read_csv('cleaned_data3.csv')\n",
+    "\n",
+    "# Select the column to inspect (only the shortened text)\n",
+    "selected_columns = df[['kısaltılmıs_metin']]\n",
+    "\n",
+    "# Plain-text view of the selection\n",
+    "print(selected_columns)\n",
+    "\n",
+    "# Rich table view of the same selection when running inside Jupyter\n",
+    "display(selected_columns)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "'float' object has no attribute 'split'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[10], line 12\u001b[0m\n\u001b[0;32m      9\u001b[0m df_sample \u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39msample(n\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10000\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m)\n\u001b[0;32m     11\u001b[0m \u001b[38;5;66;03m# Kelimeleri token'lara ayırma\u001b[39;00m\n\u001b[1;32m---> 12\u001b[0m tokenized_text \u001b[38;5;241m=\u001b[39m [text\u001b[38;5;241m.\u001b[39msplit() \u001b[38;5;28;01mfor\u001b[39;00m text \u001b[38;5;129;01min\u001b[39;00m df_sample[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mkısaltılmıs_metin\u001b[39m\u001b[38;5;124m'\u001b[39m]]\n\u001b[0;32m     14\u001b[0m \u001b[38;5;66;03m# Dictionary ve Corpus oluşturma\u001b[39;00m\n\u001b[0;32m     15\u001b[0m id2word \u001b[38;5;241m=\u001b[39m corpora\u001b[38;5;241m.\u001b[39mDictionary(tokenized_text)\n",
+      "Cell \u001b[1;32mIn[10], line 12\u001b[0m, in \u001b[0;36m<listcomp>\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m      9\u001b[0m df_sample \u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39msample(n\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10000\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m)\n\u001b[0;32m     11\u001b[0m \u001b[38;5;66;03m# Kelimeleri token'lara ayırma\u001b[39;00m\n\u001b[1;32m---> 12\u001b[0m tokenized_text \u001b[38;5;241m=\u001b[39m [\u001b[43mtext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m() \u001b[38;5;28;01mfor\u001b[39;00m text \u001b[38;5;129;01min\u001b[39;00m df_sample[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mkısaltılmıs_metin\u001b[39m\u001b[38;5;124m'\u001b[39m]]\n\u001b[0;32m     14\u001b[0m \u001b[38;5;66;03m# Dictionary ve Corpus oluşturma\u001b[39;00m\n\u001b[0;32m     15\u001b[0m id2word \u001b[38;5;241m=\u001b[39m corpora\u001b[38;5;241m.\u001b[39mDictionary(tokenized_text)\n",
+      "\u001b[1;31mAttributeError\u001b[0m: 'float' object has no attribute 'split'"
+     ]
+    }
+   ],
+   "source": [
+    "from gensim import corpora\n",
+    "from gensim.models import LdaMulticore\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Read the CSV file\n",
+    "df = pd.read_csv('cleaned_data3.csv')\n",
+    "\n",
+    "# Rows without text come back from the CSV as NaN (a float), which made\n",
+    "# `text.split()` raise AttributeError; drop them before sampling\n",
+    "df_valid = df.dropna(subset=['kısaltılmıs_metin'])\n",
+    "\n",
+    "# Select a subset of the data (never request more rows than are available)\n",
+    "df_sample = df_valid.sample(n=min(10000, len(df_valid)), random_state=100)\n",
+    "\n",
+    "# Split each text into word tokens\n",
+    "tokenized_text = [text.split() for text in df_sample['kısaltılmıs_metin'].astype(str)]\n",
+    "\n",
+    "# Build the dictionary and the bag-of-words corpus\n",
+    "id2word = corpora.Dictionary(tokenized_text)\n",
+    "corpus = [id2word.doc2bow(text) for text in tokenized_text]\n",
+    "\n",
+    "# Train the LDA model\n",
+    "lda_model = LdaMulticore(\n",
+    "    corpus=corpus,\n",
+    "    id2word=id2word,\n",
+    "    num_topics=5,\n",
+    "    random_state=100,\n",
+    "    chunksize=50,\n",
+    "    passes=5,\n",
+    "    alpha='symmetric',\n",
+    "    eta='auto',\n",
+    "    per_word_topics=True,\n",
+    "    workers=4  # number of worker processes used for parallel training\n",
+    ")\n",
+    "\n",
+    "# Show the topics\n",
+    "for idx, topic in lda_model.print_topics(-1):\n",
+    "    print(f\"Topic: {idx}\\nWords: {topic}\\n\")\n",
+    "\n",
+    "# Save the sampled subset\n",
+    "df_sample.to_csv('cleaned_processed_data_sample.csv', index=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}