yonkasoft committed
Commit
2c996b2
1 Parent(s): 201583f

Upload datasets.ipynb

Files changed (1)
  1. datasets.ipynb +582 -94
datasets.ipynb CHANGED
@@ -9,18 +9,9 @@
9
  },
10
  {
11
  "cell_type": "code",
12
- "execution_count": 1,
13
  "metadata": {},
14
- "outputs": [
15
- {
16
- "name": "stderr",
17
- "output_type": "stream",
18
- "text": [
19
- "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
20
- " from .autonotebook import tqdm as notebook_tqdm\n"
21
- ]
22
- }
23
- ],
24
  "source": [
25
  "from datasets import load_dataset\n",
26
  "import pandas as pd \n",
@@ -39,7 +30,7 @@
39
  },
40
  {
41
  "cell_type": "code",
42
- "execution_count": 3,
43
  "metadata": {},
44
  "outputs": [],
45
  "source": [
@@ -50,7 +41,7 @@
50
  },
51
  {
52
  "cell_type": "code",
53
- "execution_count": 4,
54
  "metadata": {},
55
  "outputs": [],
56
  "source": [
@@ -60,7 +51,7 @@
60
  },
61
  {
62
  "cell_type": "code",
63
- "execution_count": 5,
64
  "metadata": {},
65
  "outputs": [],
66
  "source": [
@@ -71,7 +62,7 @@
71
  },
72
  {
73
  "cell_type": "code",
74
- "execution_count": 6,
75
  "metadata": {},
76
  "outputs": [],
77
  "source": [
@@ -101,7 +92,7 @@
101
  },
102
  {
103
  "cell_type": "code",
104
- "execution_count": 7,
105
  "metadata": {},
106
  "outputs": [
107
  {
@@ -151,7 +142,7 @@
151
  },
152
  {
153
  "cell_type": "code",
154
- "execution_count": 7,
155
  "metadata": {},
156
  "outputs": [
157
  {
@@ -206,12 +197,96 @@
206
  " return train_collection,test_collection\n",
207
  "\n",
208
  "# Train ve test datasetlerini MongoDB'ye yüklemek için fonksiyonu çağır\n",
209
- "train_file_path = 'C:\\\\gitProjects\\\\bert\\\\datasets\\\\train_Egitim\\\\merged_train.parquet'\n",
210
- "test_file_path = 'C:\\\\gitProjects\\\\bert\\\\datasets\\\\test_Egitim\\\\merged_test.parquet'\n",
211
  "\n",
212
  "train_collection, test_collection = dataset_read(train_file_path, test_file_path)"
213
  ]
214
  },
215
  {
216
  "cell_type": "markdown",
217
  "metadata": {},
@@ -225,80 +300,466 @@
225
  "metadata": {},
226
  "outputs": [],
227
  "source": [
228
  "from sklearn.feature_extraction.text import TfidfVectorizer\n",
229
- "from sentence_transformers import SentenceTransformer\n",
230
  "\n",
231
- "#bert base modeli \n",
232
- "model = SentenceTransformer(\"emrecan/bert-base-turkish-cased-mean-nli-stsb-tr\")\n",
  "\n",
234
- "#text dosyasını koleksiyon üzerinden çekme \n",
235
- "# Database sınıfı: Veritabanı bağlantıları ve verileri çekme işlevleri\n",
236
- "# Database sınıfı: Veritabanı bağlantıları ve verileri çekme işlevleri\n",
237
  "class Database:\n",
238
  " @staticmethod\n",
239
  " def get_mongodb():\n",
240
- " # MongoDB bağlantı bilgilerini döndürecek şekilde tanımlanmıştır.\n",
241
- " return 'mongodb://localhost:27017/', 'EgitimDatabase', 'train'\n",
242
  "\n",
 
243
  " @staticmethod\n",
244
- " def get_input_titles():\n",
245
  " mongo_url, db_name, collection_name = Database.get_mongodb()\n",
246
  " client = MongoClient(mongo_url)\n",
247
  " db = client[db_name]\n",
248
  " collection = db[collection_name]\n",
249
- " query = {\"title\": {\"$exists\": True}}\n",
250
- " cursor = collection.find(query, {\"title\": 1, \"_id\": 0})\n",
251
- " # Başlıkları listeye aldık\n",
252
- " title_from_db = [doc['title'] for doc in cursor]\n",
253
- " title_count = len(title_from_db)\n",
254
- " return title_from_db, title_count\n",
255
  " \n",
 
256
  " @staticmethod\n",
257
- " def get_input_texts():\n",
258
- " mongo_url, db_name, collection_name = Database.get_mongodb()\n",
259
- " client = MongoClient(mongo_url)\n",
260
- " db = client[db_name]\n",
261
- " collection = db[collection_name]\n",
262
- " query = {\"text\": {\"$exists\": True}}\n",
263
- " cursor = collection.find(query, {\"text\": 1, \"_id\": 0})\n",
264
- " text_from_db = [doc['text'] for doc in cursor]\n",
265
- " text_count= len(text_from_db)\n",
266
- " return text_from_db,text_count\n",
267
  "\n",
268
  "\n",
269
- "# Veritabanından başlıklar ve metinler alınır\n",
270
- "titles, title_count = Database.get_input_titles()\n",
271
- "texts = Database.get_input_texts()\n",
 
272
  "\n",
273
- "#sonuçların belirlenmesi\n",
274
- "documents = titles + texts\n",
275
- "print(f\"Başlıklar: {titles}\")\n",
276
- "print(f\"Başlık sayısı: {title_count}\")\n",
277
- "#print(f\"Metinler: {texts}\")\n",
278
- "print(f\"Metin sayısı: {len(texts)}\")"
279
  ]
280
  },
281
  {
282
- "cell_type": "markdown",
 
283
  "metadata": {},
284
  "source": [
285
- "TF-IDF HESAPLAMA"
286
  ]
287
  },
288
  {
289
  "cell_type": "code",
290
- "execution_count": 20,
291
  "metadata": {},
292
  "outputs": [
293
  {
294
- "data": {
295
- "text/plain": [
296
- "\"turkish_stop_words = set([\\n 'ad', 'adım', 'ah', 'ama', 'an', 'ancak', 'araba', 'aralar', 'aslında', \\n 'b', 'bazı', 'belirli', 'ben', 'bence', 'bunu', 'burada', 'biz', 'bu', 'buna', 'çünkü', \\n 'da', 'de', 'demek', 'den', 'derken', 'değil', 'daha', 'dolayı', 'edilir', 'eğer', 'en', 'fakat', \\n 'genellikle', 'gibi', 'hem', 'her', 'herhangi', 'hiç', 'ise', 'işte', 'itibaren', 'iyi', 'kadar', \\n 'karşı', 'ki', 'kime', 'kısaca', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'niye', 'o', 'olabilir', 'oluşur', \\n 'önce', 'şu', 'sadece', 'se', 'şey', 'şimdi', 'tabi', 'tüm', 've', 'ya', 'ya da', 'yani', 'yine'\\n])\\ndef calculate_tfidf(documents):\\n vectorizer = TfidfVectorizer(stop_words=turkish_stop_words, max_features=10000) # max_features ile özellik sayısını sınırlıyoruz\\n tfidf_matrix = vectorizer.fit_transform(documents)\\n feature_names = vectorizer.get_feature_names_out()\\n return tfidf_matrix, feature_names\\n\\n#feature_names lerin belirlenmesi grekir \\ntfidf_matrix, feature_names=calculate_tfidf(documents)\\n\\n\\n\\n# En yüksek TF-IDF skorlarına sahip anahtar kelimeleri çıkarın\\n#sıkışık format kullanmarak tf-ıdf matrisini işleme \\ndef get_top_n_keywords_sparse(n=10):\\n\\n # TF-IDF hesaplayıcı oluşturun\\n vectorizer = TfidfVectorizer()\\n\\n # Başlıklar ve metinler ile TF-IDF matrisini oluşturun\\n texts = Database.get_input_texts()\\n titles = Database.get_input_titles()\\n \\n\\n #title ve text değerlerini alarak vektörleştirdik.\\n tfidf_matrix = vectorizer.fit_transform(documents)\\n\\n # Özellik adlarını (kelimeleri) alın\\n\\n feature_names = vectorizer.get_feature_names_out()\\n\\n # TF-IDF sonuçlarını DataFrame'e dönüştürün\\n df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)\\n print(df)\\n keywords = {}\\n for i in range(tfidf_matrix.shape[0]):\\n row = tfidf_matrix[i].toarray().flatten() #list yapısından çıkarma \\n sorted_indices = row.argsort()[::-1] # Büyükten küçüğe sıralama\\n top_indices = sorted_indices[:n]\\n top_keywords = [feature_names[idx] for idx in top_indices]\\n keywords[i] = top_keywords\\n return keywords\""
297
- ]
298
- },
299
- "execution_count": 20,
300
- "metadata": {},
301
- "output_type": "execute_result"
302
  }
303
  ],
304
  "source": [
@@ -315,6 +776,8 @@
315
  " def get_mongodb():\n",
316
  " return 'mongodb://localhost:27017/', 'EgitimDatabase', 'train'\n",
317
  "\n",
 
 
318
  " @staticmethod\n",
319
  " def get_input_documents(limit=3):\n",
320
  " mongo_url, db_name, collection_name = Database.get_mongodb()\n",
@@ -322,11 +785,11 @@
322
  " db = client[db_name]\n",
323
  " collection = db[collection_name]\n",
324
  " cursor = collection.find().limit(limit)\n",
325
- " documents = [doc for doc in cursor]\n",
326
- " document_count = len(documents)\n",
327
  " \n",
328
  " # Dökümanları isimlendir\n",
329
- " named_documents = {f'döküman {i+1}': doc for i, doc in enumerate(documents)}\n",
330
  " \n",
331
  " return named_documents, document_count\n",
332
  "\n",
@@ -353,7 +816,7 @@
353
  " return Database.get_input_documents(limit)\n",
354
  "\n",
355
  "# Kullanım örneği\n",
356
- "named_documents, document_count = Tf.get_input_documents(limit=3)\n",
357
  "\n",
358
  "#tf-ıdf ile döküman içerisinden kelime seçme \n",
359
  "\n",
@@ -387,23 +850,30 @@
387
  " for word, score in sorted_words[:3]:\n",
388
  " print(\"\\tWord: {}, TF-IDF: {}\".format(word, round(score, 5)))\n",
389
  "\n",
 
 
390
  "turkish_stop_words = [\n",
391
  " 'ah', 'ama', 'an', 'ancak', 'araba', 'aralar', 'aslında', \n",
392
- " 'b','başlayan','bağlı', 'bazı', 'belirli', 'ben', 'bence','birkaç','birlikte', 'bunu', 'burada','biten','biten' ,'biz', 'bu', 'buna', 'çünkü', \n",
393
- " 'da', 'de', 'demek', 'den', 'derken', 'değil', 'daha', 'dolayı', 'edilir', 'eğer', 'en', 'fakat', \n",
394
- " 'genellikle', 'gibi', 'hem', 'her', 'herhangi', 'hiç', 'ise', 'işte', 'itibaren', 'iyi', 'kadar', \n",
395
- " 'karşı', 'ki', 'kime', 'kısaca', 'mu', '', 'nasıl', 'ne', 'neden', 'niye', 'o', 'olasılıkla','olabilir', 'oluşur', \n",
396
- " 'önce', 'şu', 'sadece', 'se', 'şey', 'şimdi', 'tabi', 'tüm', 've', 'ya', 'ya da','yanı' ,'yanı','yani','yılında','yılında','yetenekli', 'yine'\n",
  "]\n",
398
  "\n",
399
- "#featuresların eklenmesi gerekir \n",
400
- "def calculate_tfidf(documents, stop_words):\n",
401
  " vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=10000)\n",
402
- " tfidf_matrix = vectorizer.fit_transform(documents)\n",
403
  " feature_names = vectorizer.get_feature_names_out()\n",
404
  " return tfidf_matrix, feature_names\n",
405
  "\n",
406
- "\n",
407
  "#kelimelerin ortalama skorlarını hesaplama \n",
408
  "def identify_low_tfidf_words(tfidf_matrix, feature_names, threshold=0.001):\n",
409
  " # TF-IDF skorlarını toplayarak her kelimenin ortalama skorunu hesaplayın\n",
@@ -411,48 +881,54 @@
411
  " low_tfidf_words = [feature_names[i] for i, score in enumerate(avg_scores) if score < threshold]\n",
412
  " return low_tfidf_words\n",
413
  "\n",
414
- "#kelimelerin güncellenmesi \n",
415
  "def update_stop_words(existing_stop_words, low_tfidf_words):\n",
416
  " updated_stop_words = set(existing_stop_words) | set(low_tfidf_words)\n",
417
  " return list(updated_stop_words)\n",
418
  "\n",
419
  "\n",
420
- "def iterative_update(documents, initial_stop_words, iterations=5):\n",
 
421
  " stop_words = set(initial_stop_words)\n",
422
  " for _ in range(iterations):\n",
423
- " tfidf_matrix, feature_names = calculate_tfidf(documents, stop_words)\n",
424
  " low_tfidf_words = identify_low_tfidf_words(tfidf_matrix, feature_names)\n",
425
  " stop_words = update_stop_words(stop_words, low_tfidf_words)\n",
426
  " return list(stop_words)\n",
427
- "stop_words= iterative_update\n",
428
  "\n",
429
  "\n",
430
  "def main ():\n",
431
  "\n",
 
432
  "#anlam ilişkisini de kontrol edecek bir yapı oluşpturulacak title ile benzerlik kontrol ederek yüksek benzerlik içeren kelimler sıralnacak .\n",
433
  "\n",
434
  "# Dökümanları liste olarak al\n",
435
  " documents_list = [doc.get('text', '') if isinstance(doc, dict) else doc for doc in list(named_documents.values())]\n",
436
  "\n",
  " #tf-ıdf hesaplama\n",
438
- " tfidf_matrix, feature_names=calculate_tfidf(documents_list,stop_words)\n",
439
  "\n",
440
- "# Veritabanından dökümanları alın\n",
441
- " named_documents, document_count = Database.get_input_documents(limit=3)\n",
442
  "\n",
443
- "#başalngıç stop değerleriyle yeni olanları arasında değişim yapma \n",
444
- " initial_stop_words = turkish_stop_words\n",
 
445
  "\n",
446
- "# Stop-words listesini iteratif olarak güncelleyin\n",
447
- " final_stop_words = iterative_update(documents_list, initial_stop_words)\n",
448
  "\n",
449
- " print(\"Güncellenmiş Stop-Words Listesi:\", final_stop_words)\n",
 
450
  "\n",
451
  "\n",
452
  "# Sonuçları yazdır\n",
453
- " print(\"İsimlendirilmiş Dökümanlar:\")\n",
454
- " for name, doc in named_documents.items():\n",
455
- " print(f\"{name}: {doc}\")\n",
456
  "\n",
457
  " print(\"\\nDökümanlar Listesi:\")\n",
458
  " print(documents_list)\n",
@@ -534,9 +1010,21 @@
534
  },
535
  {
536
  "cell_type": "code",
537
- "execution_count": null,
538
  "metadata": {},
539
- "outputs": [],
540
  "source": [
541
  "\n",
542
  "#---------------------------------------------------------------------------------------------------------------------------------\n",
 
9
  },
10
  {
11
  "cell_type": "code",
12
+ "execution_count": 6,
13
  "metadata": {},
14
+ "outputs": [],
15
  "source": [
16
  "from datasets import load_dataset\n",
17
  "import pandas as pd \n",
 
30
  },
31
  {
32
  "cell_type": "code",
33
+ "execution_count": 8,
34
  "metadata": {},
35
  "outputs": [],
36
  "source": [
 
41
  },
42
  {
43
  "cell_type": "code",
44
+ "execution_count": 9,
45
  "metadata": {},
46
  "outputs": [],
47
  "source": [
 
51
  },
52
  {
53
  "cell_type": "code",
54
+ "execution_count": 10,
55
  "metadata": {},
56
  "outputs": [],
57
  "source": [
 
62
  },
63
  {
64
  "cell_type": "code",
65
+ "execution_count": 11,
66
  "metadata": {},
67
  "outputs": [],
68
  "source": [
 
92
  },
93
  {
94
  "cell_type": "code",
95
+ "execution_count": 12,
96
  "metadata": {},
97
  "outputs": [
98
  {
 
142
  },
143
  {
144
  "cell_type": "code",
145
+ "execution_count": 13,
146
  "metadata": {},
147
  "outputs": [
148
  {
 
197
  " return train_collection,test_collection\n",
198
  "\n",
199
  "# Train ve test datasetlerini MongoDB'ye yüklemek için fonksiyonu çağır\n",
200
+ "train_file_path = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\train_Egitim\\\\merged_train.parquet'\n",
201
+ "test_file_path = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\test_Egitim\\\\merged_test.parquet'\n",
202
  "\n",
203
  "train_collection, test_collection = dataset_read(train_file_path, test_file_path)"
204
  ]
205
  },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": null,
209
+ "metadata": {},
210
+ "outputs": [],
211
+ "source": [
212
+ "import pandas as pd\n",
213
+ "from pymongo import MongoClient,errors\n",
214
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
215
+ "from sentence_transformers import SentenceTransformer\n",
216
+ "\n",
217
+ "# MongoDB bağlantı ve koleksiyon seçimi için fonksiyon\n",
218
+ "def get_mongodb(database_name='EgitimDatabase', train_collection_name='train', test_collection_name='test', host='localhost', port=27017):\n",
219
+ " client = MongoClient(f'mongodb://{host}:{port}/')\n",
220
+ " db = client[database_name]\n",
221
+ " train_collection = db[train_collection_name]\n",
222
+ " test_collection = db[test_collection_name]\n",
223
+ " return train_collection, test_collection\n",
224
+ "\n",
225
+ "# Dataset'i MongoDB'ye yükleme fonksiyonu\n",
226
+ "def dataset_read(train_file_path, test_file_path):\n",
227
+ " try:\n",
228
+ " # MongoDB koleksiyonlarını al\n",
229
+ " train_collection, test_collection = get_mongodb()\n",
230
+ "\n",
231
+ " # Eğer koleksiyonlar zaten doluysa, veri yüklemesi yapma\n",
232
+ " if train_collection.estimated_document_count() > 0 or test_collection.estimated_document_count() > 0:\n",
233
+ " print(\"Veriler zaten yüklendi, işlem yapılmadı.\")\n",
234
+ " return train_collection, test_collection\n",
235
+ "\n",
236
+ " # Datasetleri oku\n",
237
+ " data_train = pd.read_parquet(train_file_path, columns=['id', 'url', 'title', 'text'])\n",
238
+ " data_test = pd.read_parquet(test_file_path, columns=['id', 'url', 'title', 'text'])\n",
239
+ "\n",
240
+ " # Verileri MongoDB'ye yükle\n",
241
+ " train_collection.insert_many(data_train.to_dict(\"records\"))\n",
242
+ " test_collection.insert_many(data_test.to_dict(\"records\"))\n",
243
+ "\n",
244
+ " print(f\"Veriler başarıyla {train_collection.name} koleksiyonuna yüklendi.\")\n",
245
+ " print(f\"Veriler başarıyla {test_collection.name} koleksiyonuna yüklendi.\")\n",
246
+ " \n",
247
+ " except errors.PyMongoError as e:\n",
248
+ " print(f\"Veri yükleme sırasında hata oluştu: {e}\")\n",
249
+ "\n",
250
+ " return train_collection, test_collection\n",
251
+ "\n",
252
+ "\n",
253
+ "\n",
254
+ "# Database sınıfı: Veritabanı bağlantıları ve verileri çekme işlevleri\n",
255
+ "class Database:\n",
256
+ " @staticmethod\n",
257
+ " def get_mongodb():\n",
258
+ " return get_mongodb()\n",
259
+ "\n",
260
+ " @staticmethod\n",
261
+ " def get_titles_and_texts():\n",
262
+ " # MongoDB bağlantısı ve koleksiyonları al\n",
263
+ " train_collection, _ = Database.get_mongodb()\n",
264
+ "\n",
265
+ " # Sorgu: Hem \"title\" hem de \"text\" alanı mevcut olan belgeler\n",
266
+ " query = {\"title\": {\"$exists\": True}, \"text\": {\"$exists\": True}}\n",
267
+ "\n",
268
+ " # Belirtilen alanları seçiyoruz: \"title\", \"text\"\n",
269
+ " cursor = train_collection.find(query, {\"title\": 1, \"text\": 1, \"_id\": 0})\n",
270
+ "\n",
271
+ " # Başlık ve metinleri doğru bir şekilde birleştiriyoruz\n",
272
+ " documents = [{\"title\": doc['title'], \"text\": doc['text']} for doc in cursor]\n",
273
+ " document_count = len(documents)\n",
274
+ " return documents, document_count\n",
275
+ "\n",
276
+ "# Train ve test datasetlerini MongoDB'ye yüklemek için fonksiyonu çağır\n",
277
+ "train_file_path = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\train_Egitim\\\\merged_train.parquet'\n",
278
+ "test_file_path = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\test_Egitim\\\\merged_test.parquet'\n",
279
+ "\n",
280
+ "train_collection, test_collection = dataset_read(train_file_path, test_file_path)\n",
281
+ "\n",
282
+ "# Veritabanından başlıklar ve metinler alınır\n",
283
+ "documents, document_count = Database.get_titles_and_texts()\n",
284
+ "\n",
285
+ "# Sonuçların belirlenmesi\n",
286
+ "print(f\"Başlık ve metin çiftleri: {documents}\")\n",
287
+ "print(f\"Toplam çift sayısı: {document_count}\")\n"
288
+ ]
289
+ },
290
  {
291
  "cell_type": "markdown",
292
  "metadata": {},
 
300
  "metadata": {},
301
  "outputs": [],
302
  "source": [
303
+ "\"\"\"@staticmethod\n",
304
+ " def get_input_titles():\n",
305
+ " collection = Database.get_mongodb(collection_name='train')\n",
306
+ " query = {\"title\": {\"$exists\": True}}\n",
307
+ " cursor = collection.find(query, {\"title\": 1, \"_id\": 0})\n",
308
+ " title_from_db = [doc['title'] for doc in cursor]\n",
309
+ "\n",
310
+ " return title_from_db\"\"\"\n",
311
+ "\n",
312
+ "\"\"\"@staticmethod\n",
313
+ " def get_input_texts():\n",
314
+ " collection = Database.get_mongodb(collection_name='train')\n",
315
+ " query = {\"texts\": {\"$exists\": True}}\n",
316
+ " cursor = collection.find(query, {\"texts\": 1, \"_id\": 0})\n",
317
+ " texts_from_db = [doc['texts'] for doc in cursor]\n",
318
+ " return texts_from_db\"\"\"\n",
319
+ " \n",
320
+ " #bin tane veri çekerek csv dosyası olarak kaydetme \n",
321
+ " \n",
322
+ " \n",
323
+ "\"\"\"@staticmethod\n",
324
+ " def get_titles_and_texts(batch_size=1000):\n",
325
+ "\n",
326
+ " \n",
327
+ " titles = Database.get_input_titles(batch_size=batch_size)\n",
328
+ " texts = Database.get_input_texts(batch_size=batch_size )\n",
329
+ " \n",
330
+ "\n",
331
+ "\n",
332
+ " def test_queries():\n",
333
+ "\n",
334
+ " collection = Database.get_mongodb(collection_name='train')\n",
335
+ " # Başlık sorgusu\n",
336
+ " titles_cursor = collection.find({\"title\": {\"$exists\": True}}, {\"title\": 1, \"_id\": 0})\n",
337
+ " titles = [doc['title'] for doc in titles_cursor]\n",
338
+ " \n",
339
+ "\n",
340
+ " # Metin sorgusu\n",
341
+ " texts_cursor = collection.find({\"text\": {\"$exists\": True}}, {\"text\": 1, \"_id\": 0})\n",
342
+ " texts = [doc['text'] for doc in texts_cursor]\n",
343
+ " \n",
344
+ " # Başlık ve metinlerin eşleşmesini sağlamak için zip kullanarak birleştiriyoruz\n",
345
+ " documents = [{\"title\": title, \"text\": text} for title, text in zip(titles, texts)]\n",
346
+ " document_count = len(documents)\n",
347
+ " return documents, document_count\n",
348
+ "\n",
349
+ "Database.test_queries()\n",
350
+ "\n",
351
+ "# Veritabanından başlıklar ve metinler alınır\n",
352
+ "documents, document_count = Database.get_titles_and_texts(batch_size=1000)\n",
353
+ "\n",
354
+ "# Sonuçların belirlenmesi\n",
355
+ "print(f\"Başlık ve metin çiftleri: {documents}\")\n",
356
+ "print(f\"Toplam çift sayısı: {document_count}\")\"\"\""
357
+ ]
358
+ },
359
+ {
360
+ "cell_type": "markdown",
361
+ "metadata": {},
362
+ "source": [
363
+ "Output'u vereceğimiz title ve textin kodu"
364
+ ]
365
+ },
366
+ {
367
+ "cell_type": "code",
368
+ "execution_count": 8,
369
+ "metadata": {},
370
+ "outputs": [
371
+ {
372
+ "name": "stdout",
373
+ "output_type": "stream",
374
+ "text": [
375
+ "0 **Pşıqo Ahecaqo** Pşıqo Ahecaqo (), Çerkes siy...\n",
376
+ "1 **Craterolophinae** Craterolophinae, Depastrid...\n",
377
+ "2 **Notocrabro** Notocrabro Crabronina oymağına ...\n",
378
+ "3 **Ibrahim Sissoko** İbrahim Sissoko (d. 30 Kas...\n",
379
+ "4 **Salah Cedid** Salah Cedid (1926-1993) (Arapç...\n",
380
+ "Name: combined, dtype: object\n",
381
+ "Veriler combined_output.csv dosyasına başarıyla kaydedildi.\n"
382
+ ]
383
+ }
384
+ ],
385
+ "source": [
386
+ "from pymongo import MongoClient\n",
387
+ "import pandas as pd\n",
388
+ "from tqdm.auto import tqdm, trange\n",
389
+ "\n",
390
+ "# Database bağlantıları ve verileri çekme işlevleri\n",
391
+ "class Database:\n",
392
+ " @staticmethod\n",
393
+ " def get_mongodb(database_name='EgitimDatabase', train_collection_name='train', test_collection_name='test', host='localhost', port=27017):\n",
394
+ " client = MongoClient(f'mongodb://{host}:{port}/')\n",
395
+ " db = client[database_name]\n",
396
+ " train_collection = db[train_collection_name]\n",
397
+ " test_collection = db[test_collection_name]\n",
398
+ " return train_collection, test_collection\n",
399
+ "\n",
400
+ " def export_to_csv(batch_size=1000, output_file='combined_output.csv'):\n",
401
+ " train_collection, _ = Database.get_mongodb()\n",
402
+ " cursor = train_collection.find({}, {\"title\": 1, \"text\": 1, \"_id\": 0})\n",
403
+ " cursor = cursor.batch_size(batch_size) # Fix: Call batch_size on the cursor object\n",
404
+ "\n",
405
+ " # Verileri DataFrame'e dönüştürme\n",
406
+ " df= pd.DataFrame(list(cursor))\n",
407
+ " \n",
408
+ " # title ve text sütunlarını birleştirme\n",
409
+ " df['combined'] = df.apply(lambda row: f'**{row[\"title\"]}** {row[\"text\"]}', axis=1)\n",
410
+ " \n",
411
+ " #title,text and combined sütunlarını ayrı ayrı tutma\n",
412
+ " #df2['title_only'] = df2['title']\n",
413
+ " #df2['text_only'] = df2['text']\n",
414
+ " #df['combined']= output_file\n",
415
+ "\n",
416
+ " # Sonuçları kontrol etme\n",
417
+ " combined_text= df['combined'] \n",
418
+ " # Print the combined column directly\n",
419
+ " \n",
420
+ " print(combined_text.head())\n",
421
+ "\n",
422
+ " # Birleşmiş verileri CSV'ye kaydetme\n",
423
+ " \n",
424
+ " df.to_csv(output_file, index=False)\n",
425
+ " \n",
426
+ " print(f\"Veriler combined_output.csv dosyasına başarıyla kaydedildi.\")\n",
427
+ " \n",
428
+ "\n",
429
+ "# CSV dosyasını okuma ve birleştirme işlemi\n",
430
+ "Database.export_to_csv()"
431
+ ]
432
+ },
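The cell above writes the title, text, and combined columns to combined_output.csv. A minimal sketch of feeding that file back into TfidfVectorizer, assuming the file name and column names from the cell above; the max_features value is illustrative:

# Sketch only: read the CSV exported by export_to_csv() and vectorize the 'combined' column.
# Assumes combined_output.csv exists in the working directory.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.read_csv("combined_output.csv")
corpus = df["combined"].astype(str).tolist()  # one string per document

vectorizer = TfidfVectorizer(max_features=10000)  # stop words omitted in this sketch
tfidf_matrix = vectorizer.fit_transform(corpus)
print(tfidf_matrix.shape, len(vectorizer.get_feature_names_out()))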
433
+ {
434
+ "cell_type": "markdown",
435
+ "metadata": {},
436
+ "source": [
437
+ "TF-IDF HESAPLAMA"
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "code",
442
+ "execution_count": 20,
443
+ "metadata": {},
444
+ "outputs": [
445
+ {
446
+ "name": "stderr",
447
+ "output_type": "stream",
448
+ "text": [
449
+ "[nltk_data] Downloading package wordnet to\n",
450
+ "[nltk_data] C:\\Users\\info\\AppData\\Roaming\\nltk_data...\n",
451
+ "[nltk_data] Package wordnet is already up-to-date!\n",
452
+ "[nltk_data] Downloading package omw-1.4 to\n",
453
+ "[nltk_data] C:\\Users\\info\\AppData\\Roaming\\nltk_data...\n",
454
+ "[nltk_data] Package omw-1.4 is already up-to-date!\n",
455
+ "[nltk_data] Downloading package stopwords to\n",
456
+ "[nltk_data] C:\\Users\\info\\AppData\\Roaming\\nltk_data...\n",
457
+ "[nltk_data] Package stopwords is already up-to-date!\n"
458
+ ]
459
+ },
460
+ {
461
+ "ename": "ValueError",
462
+ "evalue": "empty vocabulary; perhaps the documents only contain stop words",
463
+ "output_type": "error",
464
+ "traceback": [
465
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
466
+ "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
467
+ "Cell \u001b[1;32mIn[20], line 100\u001b[0m\n\u001b[0;32m 97\u001b[0m documents, document_count \u001b[38;5;241m=\u001b[39m Database\u001b[38;5;241m.\u001b[39mget_input_documents()\n\u001b[0;32m 99\u001b[0m \u001b[38;5;66;03m# Calculate TF-IDF and get feature names\u001b[39;00m\n\u001b[1;32m--> 100\u001b[0m tfidf_matrix, feature_names \u001b[38;5;241m=\u001b[39m \u001b[43mDatabase\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate_tfidf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mturkish_stop_words\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 102\u001b[0m \u001b[38;5;66;03m# Extract keywords\u001b[39;00m\n\u001b[0;32m 103\u001b[0m keywords \u001b[38;5;241m=\u001b[39m Database\u001b[38;5;241m.\u001b[39mextract_keywords(tfidf_matrix, feature_names, stop_words\u001b[38;5;241m=\u001b[39mturkish_stop_words)\n",
468
+ "Cell \u001b[1;32mIn[20], line 43\u001b[0m, in \u001b[0;36mDatabase.calculate_tfidf\u001b[1;34m(documents, stop_words)\u001b[0m\n\u001b[0;32m 40\u001b[0m \u001b[38;5;129m@staticmethod\u001b[39m\n\u001b[0;32m 41\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcalculate_tfidf\u001b[39m(documents, stop_words):\n\u001b[0;32m 42\u001b[0m vectorizer \u001b[38;5;241m=\u001b[39m TfidfVectorizer(stop_words\u001b[38;5;241m=\u001b[39mstop_words, max_features\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10000\u001b[39m,min_df\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m)\n\u001b[1;32m---> 43\u001b[0m tfidf_matrix \u001b[38;5;241m=\u001b[39m \u001b[43mvectorizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 44\u001b[0m feature_names \u001b[38;5;241m=\u001b[39m vectorizer\u001b[38;5;241m.\u001b[39mget_feature_names_out()\n\u001b[0;32m 45\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tfidf_matrix, feature_names\n",
469
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:2091\u001b[0m, in \u001b[0;36mTfidfVectorizer.fit_transform\u001b[1;34m(self, raw_documents, y)\u001b[0m\n\u001b[0;32m 2084\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_params()\n\u001b[0;32m 2085\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tfidf \u001b[38;5;241m=\u001b[39m TfidfTransformer(\n\u001b[0;32m 2086\u001b[0m norm\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnorm,\n\u001b[0;32m 2087\u001b[0m use_idf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muse_idf,\n\u001b[0;32m 2088\u001b[0m smooth_idf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msmooth_idf,\n\u001b[0;32m 2089\u001b[0m sublinear_tf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msublinear_tf,\n\u001b[0;32m 2090\u001b[0m )\n\u001b[1;32m-> 2091\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mraw_documents\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2092\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tfidf\u001b[38;5;241m.\u001b[39mfit(X)\n\u001b[0;32m 2093\u001b[0m \u001b[38;5;66;03m# X is already a transformed view of raw_documents so\u001b[39;00m\n\u001b[0;32m 2094\u001b[0m \u001b[38;5;66;03m# we set copy to False\u001b[39;00m\n",
470
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fit_method(estimator, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
471
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:1372\u001b[0m, in \u001b[0;36mCountVectorizer.fit_transform\u001b[1;34m(self, raw_documents, y)\u001b[0m\n\u001b[0;32m 1364\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 1365\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUpper case characters found in\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1366\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m vocabulary while \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlowercase\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1367\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m is True. These entries will not\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1368\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m be matched with any documents\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1369\u001b[0m )\n\u001b[0;32m 1370\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m-> 1372\u001b[0m vocabulary, X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_count_vocab\u001b[49m\u001b[43m(\u001b[49m\u001b[43mraw_documents\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfixed_vocabulary_\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1374\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbinary:\n\u001b[0;32m 1375\u001b[0m X\u001b[38;5;241m.\u001b[39mdata\u001b[38;5;241m.\u001b[39mfill(\u001b[38;5;241m1\u001b[39m)\n",
472
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:1278\u001b[0m, in \u001b[0;36mCountVectorizer._count_vocab\u001b[1;34m(self, raw_documents, fixed_vocab)\u001b[0m\n\u001b[0;32m 1276\u001b[0m vocabulary \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(vocabulary)\n\u001b[0;32m 1277\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m vocabulary:\n\u001b[1;32m-> 1278\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 1279\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mempty vocabulary; perhaps the documents only contain stop words\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1280\u001b[0m )\n\u001b[0;32m 1282\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m indptr[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m>\u001b[39m np\u001b[38;5;241m.\u001b[39miinfo(np\u001b[38;5;241m.\u001b[39mint32)\u001b[38;5;241m.\u001b[39mmax: \u001b[38;5;66;03m# = 2**31 - 1\u001b[39;00m\n\u001b[0;32m 1283\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _IS_32BIT:\n",
473
+ "\u001b[1;31mValueError\u001b[0m: empty vocabulary; perhaps the documents only contain stop words"
474
+ ]
475
+ }
476
+ ],
477
+ "source": [
478
+ "#---------------------------güncel en yeni \n",
479
+ "from pymongo import MongoClient\n",
480
  "from sklearn.feature_extraction.text import TfidfVectorizer\n",
481
+ "from textblob import TextBlob as tb\n",
482
+ "import numpy as np\n",
483
+ "import math\n",
484
+ "from tqdm.auto import tqdm, trange\n",
485
+ "import tensorflow as tf\n",
486
+ "import nltk\n",
487
+ "from nltk.stem import WordNetLemmatizer\n",
488
+ "from nltk.corpus import stopwords\n",
489
  "\n",
490
+ "turkish_stop_words = stopwords.words('turkish')\n",
491
+ "\n",
492
+ "nltk.download('wordnet')\n",
493
+ "nltk.download('omw-1.4')\n",
494
+ "nltk.download('stopwords')\n",
495
+ "\n",
496
+ "\n",
497
+ "import matplotlib.pyplot as plt \n",
498
  "\n",
 
 
 
499
  "class Database:\n",
500
  " @staticmethod\n",
501
  " def get_mongodb():\n",
502
+ " return 'mongodb://localhost:27017/', 'combined', 'combined_output'\n",
 
503
  "\n",
504
+ " # Get input documents from MongoDB\n",
505
  " @staticmethod\n",
506
+ " def get_input_documents(limit=1000):\n",
507
  " mongo_url, db_name, collection_name = Database.get_mongodb()\n",
508
  " client = MongoClient(mongo_url)\n",
509
  " db = client[db_name]\n",
510
  " collection = db[collection_name]\n",
511
+ " cursor = collection.find().limit(limit)\n",
512
+ " combined_text = [doc['text'] for doc in cursor]\n",
513
+ " document_count = len(combined_text)\n",
514
+ " return combined_text, document_count\n",
 
 
515
  " \n",
516
+ " # Calculate TF-IDF and get feature names\n",
517
  " @staticmethod\n",
518
+ " def calculate_tfidf(documents, stop_words):\n",
519
+ " vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=10000,min_df=2)\n",
520
+ " tfidf_matrix = vectorizer.fit_transform(documents)\n",
521
+ " feature_names = vectorizer.get_feature_names_out()\n",
522
+ " return tfidf_matrix, feature_names\n",
523
+ "\n",
524
+ " # Extract keywords using TF-IDF\n",
525
+ " def extract_keywords(tfidf_matrix, feature_names, top_n=10, stop_words=[]):\n",
526
+ " keywords = {}\n",
527
+ " for doc_idx, row in enumerate(tfidf_matrix):\n",
528
+ " filtered_feature_names = [name for name in feature_names if name.lower() not in stop_words]\n",
529
+ " scores = np.asarray(row.T.todense()).flatten()\n",
530
+ " sorted_indices = np.argsort(scores)[::-1]\n",
531
+ " top_features = sorted_indices[:top_n]\n",
532
+ " doc_keywords = [(filtered_feature_names[idx], scores[idx]) for idx in top_features]\n",
533
+ " keywords[f'document_{doc_idx+1}'] = doc_keywords\n",
534
+ " return keywords\n",
535
+ " \n",
536
+ " #zip keywords and combined text \n",
537
+ " \n",
538
+ " # Identify low TF-IDF words\n",
539
+ " @staticmethod\n",
540
+ " def identify_low_tfidf_words(tfidf_matrix, feature_names, threshold=0.001):\n",
541
+ " avg_scores = np.mean(tfidf_matrix, axis=0).A1\n",
542
+ " low_tfidf_words = [feature_names[i] for i, score in enumerate(avg_scores) if score < threshold]\n",
543
+ " return low_tfidf_words\n",
544
+ " \n",
545
+ " # Update stop words with low TF-IDF words\n",
546
+ " @staticmethod\n",
547
+ " def update_stop_words(existing_stop_words, low_tfidf_words):\n",
548
+ " updated_stop_words = set(existing_stop_words) | set(low_tfidf_words)\n",
549
+ " return list(updated_stop_words)\n",
550
  "\n",
551
  "\n",
552
+ "#tf-ıdf ile döküman içerisinden kelime seçme \n",
553
+ "#Term Frequency (TF): Bir kelimenin belli bir dökümanda tekrar etme değeri\n",
554
+ "#Inverse Document Frequency (IDF):bir kelimenin tüm dökümanlar arasındaki yaygınlığı Nadir bulunan kelimeler, daha yüksek IDF değerine sahip olur.\n",
555
+ "#tf-ıdf skoru ise bu ikisinin çarpımıdır.\n",
556
  "\n",
557
+ " #buraya eşik değer belirlenmeli\n",
558
+ "\n",
559
+ "\n",
560
+ "turkish_stop_words = [\n",
561
+ " 'ah', 'ama', 'an', 'ancak', 'araba', 'aralar', 'aslında', \n",
562
+ " 'b', 'başlayan', 'bağlı', 'bazı', 'belirli', 'ben', 'bence', \n",
563
+ " 'birkaç', 'birlikte', 'bunu', 'burada', 'biten', 'biz', \n",
564
+ " 'bu', 'buna', 'çünkü', 'da', 'de', 'demek', 'den', 'derken', \n",
565
+ " 'değil', 'daha', 'dolayı', 'edilir', 'eğer', 'en', 'fakat', \n",
566
+ " 'genellikle', 'gibi', 'hem', 'her', 'herhangi', 'hiç', 'ise', \n",
567
+ " 'işte', 'itibaren', 'iyi', 'kadar', 'karşı', 'ki', 'kime', \n",
568
+ " 'kısaca', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'niye', 'o', \n",
569
+ " 'olasılıkla', 'olabilir', 'oluşur', 'önce', 'şu', 'sadece', \n",
570
+ " 'se', 'şey', 'şimdi', 'tabi', 'tüm', 've', 'ya', 'ya da', \n",
571
+ " 'yanı', 'yani', 'yılında', 'yetenekli', 'yine'\n",
572
+ "]\n",
573
+ "# Get input documents\n",
574
+ "documents, document_count = Database.get_input_documents()\n",
575
+ "\n",
576
+ "# Calculate TF-IDF and get feature names\n",
577
+ "tfidf_matrix, feature_names = Database.calculate_tfidf(documents, turkish_stop_words)\n",
578
+ "\n",
579
+ "# Extract keywords\n",
580
+ "keywords = Database.extract_keywords(tfidf_matrix, feature_names, stop_words=turkish_stop_words)\n",
581
+ "print(keywords)\n",
582
+ "\n",
583
+ "# Identify low TF-IDF words\n",
584
+ "low_tfidf_words = Database.identify_low_tfidf_words(tfidf_matrix, feature_names)\n",
585
+ "print(low_tfidf_words)\n",
586
+ "\n",
587
+ "# Update stop words\n",
588
+ "updated_stop_words = Database.update_stop_words(turkish_stop_words, low_tfidf_words)\n",
589
+ "print(updated_stop_words) "
590
  ]
591
  },
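The ValueError above ("empty vocabulary; perhaps the documents only contain stop words") is what TfidfVectorizer raises when every term is filtered out; with min_df=2 plus a long stop-word list, a small or sparse set of documents can end up with no surviving terms. A minimal sketch of a guarded fit, assuming documents and turkish_stop_words as defined in the cell above; min_df=1 here is illustrative:

# Sketch only: drop empty documents and relax min_df before fitting TF-IDF.
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_tfidf_safe(documents, stop_words):
    # keep only non-empty strings so the vectorizer has something to index
    documents = [d for d in documents if isinstance(d, str) and d.strip()]
    if not documents:
        raise ValueError("no non-empty documents to vectorize")
    vectorizer = TfidfVectorizer(stop_words=list(stop_words), max_features=10000, min_df=1)
    tfidf_matrix = vectorizer.fit_transform(documents)
    return tfidf_matrix, vectorizer.get_feature_names_out()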
592
  {
593
+ "cell_type": "code",
594
+ "execution_count": 15,
595
  "metadata": {},
596
+ "outputs": [
597
+ {
598
+ "ename": "TypeError",
599
+ "evalue": "unhashable type: 'set'",
600
+ "output_type": "error",
601
+ "traceback": [
602
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
603
+ "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
604
+ "Cell \u001b[1;32mIn[15], line 162\u001b[0m\n\u001b[0;32m 159\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tfidf_matrix, feature_names,keywords\n\u001b[0;32m 161\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m\u001b[38;5;241m==\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m--> 162\u001b[0m tfidf_matrix, feature_names,keywords\u001b[38;5;241m=\u001b[39m \u001b[43mmain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 164\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAnahtar Kelimler:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 165\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m doc, words \u001b[38;5;129;01min\u001b[39;00m keywords\u001b[38;5;241m.\u001b[39mitems():\n",
605
+ "Cell \u001b[1;32mIn[15], line 148\u001b[0m, in \u001b[0;36mmain\u001b[1;34m()\u001b[0m\n\u001b[0;32m 146\u001b[0m initial_stop_words \u001b[38;5;241m=\u001b[39m turkish_stop_words\n\u001b[0;32m 147\u001b[0m \u001b[38;5;66;03m# Stop-words listesini iteratif olarak güncelleyin\u001b[39;00m\n\u001b[1;32m--> 148\u001b[0m final_stop_words \u001b[38;5;241m=\u001b[39m \u001b[43miterative_update\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments_list\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minitial_stop_words\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 149\u001b[0m \u001b[38;5;66;03m#tf-ıdf hesaplama\u001b[39;00m\n\u001b[0;32m 150\u001b[0m tfidf_matrix, feature_names\u001b[38;5;241m=\u001b[39mcalculate_tfidf(documents_list,final_stop_words)\n",
606
+ "Cell \u001b[1;32mIn[15], line 127\u001b[0m, in \u001b[0;36miterative_update\u001b[1;34m(documents, initial_stop_words, iterations)\u001b[0m\n\u001b[0;32m 126\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21miterative_update\u001b[39m(documents, initial_stop_words, iterations\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5\u001b[39m):\n\u001b[1;32m--> 127\u001b[0m stop_words \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mset\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43minitial_stop_words\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 128\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(iterations):\n\u001b[0;32m 129\u001b[0m tfidf_matrix, feature_names \u001b[38;5;241m=\u001b[39m calculate_tfidf(documents, stop_words)\n",
607
+ "\u001b[1;31mTypeError\u001b[0m: unhashable type: 'set'"
608
+ ]
609
+ }
610
+ ],
611
  "source": [
612
+ "\n",
613
+ "\n",
614
+ "\"\"\"class Tf:\n",
615
+ " @staticmethod\n",
616
+ " def tf(word, blob):\n",
617
+ " return blob.words.count(word) / len(blob.words)\n",
618
+ "\n",
619
+ " @staticmethod\n",
620
+ " def n_containing(word, bloblist):\n",
621
+ " return sum(1 for blob in bloblist if word in blob.words)\n",
622
+ "\n",
623
+ " @staticmethod\n",
624
+ " def idf(word, bloblist):\n",
625
+ " return math.log(len(bloblist) / (1 + Tf.n_containing(word, bloblist)))\n",
626
+ "\n",
627
+ " @staticmethod\n",
628
+ " def tfidf(word, blob, bloblist):\n",
629
+ " return Tf.tf(word, blob) * Tf.idf(word, bloblist)\n",
630
+ "\n",
631
+ " @staticmethod\n",
632
+ " def get_input_documents(limit=1000):\n",
633
+ " return Database.get_input_documents(limit)\"\"\"\n",
634
+ "\n",
635
+ "\n",
636
+ "\n",
637
+ "\n",
638
+ "\n",
639
+ " \"\"\"\n",
640
+ " Her döküman için anahtar kelimeleri seç.\n",
641
+ " :param tfidf_matrix: TF-IDF matris\n",
642
+ " :param feature_names: TF-IDF özellik isimleri\n",
643
+ " :param top_n: Her döküman için seçilecek anahtar kelime sayısı\n",
644
+ " :return: Anahtar kelimeler ve skorlari\n",
645
+ " \"\"\"\n",
646
+ " \n",
647
+ "\n",
648
+ "#--------------------------------------------------------------- burada aldığımız dökümanları listeliyoruz\n",
649
+ "# Dokümanları işleyerek TF-IDF hesaplama\n",
650
+ "#bloblist dökümanların bir listesi\n",
651
+ "\"\"\"bloblist = []\n",
652
+ "for i, blob in enumerate(bloblist):\n",
653
+ " print(\"Top words in document {}\".format(i + 1))\n",
654
+ " scores = {word: Tf.tfidf(word, blob, bloblist) for word in blob.words} #dökümanların içerisinde bulunan kelimeleri alır.\n",
655
+ " sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)\n",
656
+ " for word, score in sorted_words[:3]:\n",
657
+ " print(\"\\tWord: {}, TF-IDF: {}\".format(word, round(score, 5)))\"\"\"\n",
658
+ "\n",
659
+ "\n",
660
+ "# Dökümanları isimlendir\n",
661
+ "#named_documents = {f'döküman {i+1}': doc for i, doc in enumerate(combined_text)}\n",
662
+ "\n",
663
+ "#features olarak top_keywordsleri belirleyerek metnin bu kelimelerin etrafında olması sağlanmalı \n",
664
+ "def calculate_tfidf(documents, stop_words):\n",
665
+ " vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=10000)\n",
666
+ " tfidf_matrix = vectorizer.fit_transform(documents)\n",
667
+ " feature_names = vectorizer.get_feature_names_out()\n",
668
+ " return tfidf_matrix, feature_names\n",
669
+ "\n",
670
+ "#---------------------------------------------------------------------------------\n",
671
+ "#kelimelerin ortalama skorlarını hesaplama \n",
672
+ "def identify_low_tfidf_words(tfidf_matrix, feature_names, threshold=0.001):\n",
673
+ " # TF-IDF skorlarını toplayarak her kelimenin ortalama skorunu hesaplayın\n",
674
+ " avg_scores = np.mean(tfidf_matrix, axis=0).A1\n",
675
+ " low_tfidf_words = [feature_names[i] for i, score in enumerate(avg_scores) if score < threshold]\n",
676
+ " return low_tfidf_words\n",
677
+ "\n",
678
+ "#kelimelerin yeni geliştirilen eşik değere göre güncellenmesi \n",
679
+ "def update_stop_words(existing_stop_words, low_tfidf_words):\n",
680
+ " updated_stop_words = set(existing_stop_words) | set(low_tfidf_words)\n",
681
+ " return list(updated_stop_words)\n",
682
+ "\n",
683
+ "\n",
684
+ "#bu kısım detaylandırılmalı \n",
685
+ "def iterative_update(documents, initial_stop_words, iterations=5):\n",
686
+ " stop_words = set(initial_stop_words)\n",
687
+ " for _ in range(iterations):\n",
688
+ " tfidf_matrix, feature_names = calculate_tfidf(documents, stop_words)\n",
689
+ " low_tfidf_words = identify_low_tfidf_words(tfidf_matrix, feature_names)\n",
690
+ " stop_words = update_stop_words(stop_words, low_tfidf_words)\n",
691
+ " return list(stop_words)\n",
692
+ "\n",
693
+ "\n",
694
+ "\n",
695
+ "def main ():\n",
696
+ "\n",
697
+ " \n",
698
+ "#anlam ilişkisini de kontrol edecek bir yapı oluşpturulacak title ile benzerlik kontrol ederek yüksek benzerlik içeren kelimler sıralnacak .\n",
699
+ "\n",
700
+ "# Dökümanları liste olarak al\n",
701
+ " named_documents, _ = Tf.get_input_documents(limit=1000)\n",
702
+ " documents_list = [doc.get('text', '') if isinstance(doc, dict) else doc for doc in list(named_documents.values())]\n",
703
+ "\n",
704
+ " #başlangıç stop değerleriyle yeni olanları arasında değişim yapma \n",
705
+ " initial_stop_words = turkish_stop_words\n",
706
+ " # Stop-words listesini iteratif olarak güncelleyin\n",
707
+ " final_stop_words = iterative_update(documents_list, initial_stop_words)\n",
708
+ " #tf-ıdf hesaplama\n",
709
+ " tfidf_matrix, feature_names=calculate_tfidf(documents_list,final_stop_words)\n",
710
+ " keywords=extract_keywords(tfidf_matrix,feature_names,top_n=10)\n",
711
+ "\n",
712
+ " \n",
713
+ "\n",
714
+ " print(\"Güncellenmiş Stop-Words Listesi:\", final_stop_words)\n",
715
+ " print(\"TF-IDF Matrix Shape:\", tfidf_matrix.shape)\n",
716
+ " print(\"Feature Names Sample:\", feature_names[:10]) # İlk 10 feature adını gösterir\n",
717
+ "\n",
718
+ " return tfidf_matrix, feature_names,keywords\n",
719
+ "\n",
720
+ "if __name__==\"__main__\":\n",
721
+ " tfidf_matrix, feature_names,keywords= main()\n",
722
+ "\n",
723
+ " print(\"Anahtar Kelimler:\")\n",
724
+ " for doc, words in keywords.items():\n",
725
+ " print(f\"{doc}: {words}\")\n",
726
+ " \n",
727
+ "\n",
728
+ "#---------------------------------------------------------\n",
729
+ " \"\"\"blobs = [tb(doc) for doc in documents_list] # veya 'title' kullanarak başlıkları işleyebilirsiniz\n",
730
+ " all_words = set(word for blob in blobs for word in blob.words)\n",
731
+ "\n",
732
+ " tfidf_scores = {}\n",
733
+ " for word in all_words:\n",
734
+ " tfidf_scores[word] = [Tf.tfidf(word, blob, blobs) for blob in blobs]\n",
735
+ "\n",
736
+ " print(\"TF-IDF Skorları:\")\n",
737
+ " for word, scores in tfidf_scores.items():\n",
738
+ " print(f\"Kelime: {word}, Skorlar: {scores}\")\"\"\"\n"
739
  ]
740
  },
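The TypeError above and the InvalidParameterError in the next cell point at the same issue: the stop-word collection ends up as (or contains) a Python set, while scikit-learn's TfidfVectorizer only accepts 'english', a list of strings, or None for stop_words. A minimal sketch of the iterative update that keeps the bookkeeping in a set but always hands the vectorizer a flat list, assuming calculate_tfidf, identify_low_tfidf_words, and turkish_stop_words as defined above:

# Sketch only: stop words stay a flat collection of strings; the vectorizer gets a list.
def iterative_update(documents, initial_stop_words, iterations=5):
    stop_words = set(initial_stop_words)  # a set is fine for the bookkeeping
    for _ in range(iterations):
        # sorted() returns a list, which is what TfidfVectorizer's stop_words accepts
        tfidf_matrix, feature_names = calculate_tfidf(documents, sorted(stop_words))
        low_tfidf_words = identify_low_tfidf_words(tfidf_matrix, feature_names)
        stop_words |= set(low_tfidf_words)
    return sorted(stop_words)  # return a list so it can be passed straight to the vectorizer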
741
  {
742
  "cell_type": "code",
743
+ "execution_count": 2,
744
  "metadata": {},
745
  "outputs": [
746
  {
747
+ "ename": "InvalidParameterError",
748
+ "evalue": "The 'stop_words' parameter of TfidfVectorizer must be a str among {'english'}, an instance of 'list' or None. Got {'o', 'den', 'an', 'şey', 'burada', 've', 'ah', 'ise', 'hiç', 'yine', 'biz', 'bu', 'da', 'genellikle', 'yılında', 'belirli', 'se', 'ne', 'kadar', 'neden', 'hem', 'aralar', 'yani', 'daha', 'araba', 'derken', 'dolayı', 'kısaca', 'karşı', 'niye', 'ki', 'bunu', 'buna', 'de', 'herhangi', 'önce', 'tabi', 'kime', 'biten', 'ben', 'ya', 'ya da', 'çünkü', 'mu', 'b', 'demek', 'fakat', 'şimdi', 'birlikte', 'her', 'bağlı', 'nasıl', 'şu', 'sadece', 'tüm', 'aslında', 'edilir', 'ama', 'bence', 'en', 'işte', 'gibi', 'ancak', 'birkaç', 'itibaren', 'mü', 'olabilir', 'bazı', 'oluşur', 'başlayan', 'yanı', 'olasılıkla', 'iyi', 'değil', 'eğer', 'yetenekli'} instead.",
749
+ "output_type": "error",
750
+ "traceback": [
751
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
752
+ "\u001b[1;31mInvalidParameterError\u001b[0m Traceback (most recent call last)",
753
+ "Cell \u001b[1;32mIn[2], line 155\u001b[0m\n\u001b[0;32m 152\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tfidf_matrix, feature_names,documents_list \n\u001b[0;32m 154\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m\u001b[38;5;241m==\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m--> 155\u001b[0m tfidf_matrix, feature_names,documents_list\u001b[38;5;241m=\u001b[39m \u001b[43mmain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 158\u001b[0m \u001b[38;5;66;03m# Sonuçları yazdır\u001b[39;00m\n\u001b[0;32m 159\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mİsimlendirilmiş Dökümanlar:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
754
+ "Cell \u001b[1;32mIn[2], line 142\u001b[0m, in \u001b[0;36mmain\u001b[1;34m()\u001b[0m\n\u001b[0;32m 140\u001b[0m initial_stop_words \u001b[38;5;241m=\u001b[39m turkish_stop_words\n\u001b[0;32m 141\u001b[0m \u001b[38;5;66;03m# Stop-words listesini iteratif olarak güncelleyin\u001b[39;00m\n\u001b[1;32m--> 142\u001b[0m final_stop_words \u001b[38;5;241m=\u001b[39m \u001b[43miterative_update\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments_list\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minitial_stop_words\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 143\u001b[0m \u001b[38;5;66;03m#tf-ıdf hesaplama\u001b[39;00m\n\u001b[0;32m 144\u001b[0m tfidf_matrix, feature_names\u001b[38;5;241m=\u001b[39mcalculate_tfidf(documents_list,final_stop_words)\n",
755
+ "Cell \u001b[1;32mIn[2], line 124\u001b[0m, in \u001b[0;36miterative_update\u001b[1;34m(documents, initial_stop_words, iterations)\u001b[0m\n\u001b[0;32m 122\u001b[0m stop_words \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m(initial_stop_words)\n\u001b[0;32m 123\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(iterations):\n\u001b[1;32m--> 124\u001b[0m tfidf_matrix, feature_names \u001b[38;5;241m=\u001b[39m \u001b[43mcalculate_tfidf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop_words\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 125\u001b[0m low_tfidf_words \u001b[38;5;241m=\u001b[39m identify_low_tfidf_words(tfidf_matrix, feature_names)\n\u001b[0;32m 126\u001b[0m stop_words \u001b[38;5;241m=\u001b[39m update_stop_words(stop_words, low_tfidf_words)\n",
756
+ "Cell \u001b[1;32mIn[2], line 103\u001b[0m, in \u001b[0;36mcalculate_tfidf\u001b[1;34m(documents, stop_words)\u001b[0m\n\u001b[0;32m 101\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcalculate_tfidf\u001b[39m(documents, stop_words):\n\u001b[0;32m 102\u001b[0m vectorizer \u001b[38;5;241m=\u001b[39m TfidfVectorizer(stop_words\u001b[38;5;241m=\u001b[39mstop_words, max_features\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10000\u001b[39m)\n\u001b[1;32m--> 103\u001b[0m tfidf_matrix \u001b[38;5;241m=\u001b[39m \u001b[43mvectorizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 104\u001b[0m feature_names \u001b[38;5;241m=\u001b[39m vectorizer\u001b[38;5;241m.\u001b[39mget_feature_names_out()\n\u001b[0;32m 105\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tfidf_matrix, feature_names\n",
757
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:2091\u001b[0m, in \u001b[0;36mTfidfVectorizer.fit_transform\u001b[1;34m(self, raw_documents, y)\u001b[0m\n\u001b[0;32m 2084\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_params()\n\u001b[0;32m 2085\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tfidf \u001b[38;5;241m=\u001b[39m TfidfTransformer(\n\u001b[0;32m 2086\u001b[0m norm\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnorm,\n\u001b[0;32m 2087\u001b[0m use_idf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muse_idf,\n\u001b[0;32m 2088\u001b[0m smooth_idf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msmooth_idf,\n\u001b[0;32m 2089\u001b[0m sublinear_tf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msublinear_tf,\n\u001b[0;32m 2090\u001b[0m )\n\u001b[1;32m-> 2091\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mraw_documents\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2092\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tfidf\u001b[38;5;241m.\u001b[39mfit(X)\n\u001b[0;32m 2093\u001b[0m \u001b[38;5;66;03m# X is already a transformed view of raw_documents so\u001b[39;00m\n\u001b[0;32m 2094\u001b[0m \u001b[38;5;66;03m# we set copy to False\u001b[39;00m\n",
758
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\base.py:1466\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1461\u001b[0m partial_fit_and_fitted \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 1462\u001b[0m fit_method\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpartial_fit\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m _is_fitted(estimator)\n\u001b[0;32m 1463\u001b[0m )\n\u001b[0;32m 1465\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m global_skip_validation \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m partial_fit_and_fitted:\n\u001b[1;32m-> 1466\u001b[0m \u001b[43mestimator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_params\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[0;32m 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fit_method(estimator, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
759
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\base.py:666\u001b[0m, in \u001b[0;36mBaseEstimator._validate_params\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 658\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_validate_params\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m 659\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Validate types and values of constructor parameters\u001b[39;00m\n\u001b[0;32m 660\u001b[0m \n\u001b[0;32m 661\u001b[0m \u001b[38;5;124;03m The expected type and values must be defined in the `_parameter_constraints`\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 664\u001b[0m \u001b[38;5;124;03m accepted constraints.\u001b[39;00m\n\u001b[0;32m 665\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 666\u001b[0m \u001b[43mvalidate_parameter_constraints\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 667\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_parameter_constraints\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 668\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_params\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdeep\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 669\u001b[0m \u001b[43m \u001b[49m\u001b[43mcaller_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__class__\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__name__\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 670\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
760
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\utils\\_param_validation.py:95\u001b[0m, in \u001b[0;36mvalidate_parameter_constraints\u001b[1;34m(parameter_constraints, params, caller_name)\u001b[0m\n\u001b[0;32m 89\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 90\u001b[0m constraints_str \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 91\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin([\u001b[38;5;28mstr\u001b[39m(c)\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mfor\u001b[39;00m\u001b[38;5;250m \u001b[39mc\u001b[38;5;250m \u001b[39m\u001b[38;5;129;01min\u001b[39;00m\u001b[38;5;250m \u001b[39mconstraints[:\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]])\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m or\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 92\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstraints[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 93\u001b[0m )\n\u001b[1;32m---> 95\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidParameterError(\n\u001b[0;32m 96\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mparam_name\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[38;5;124m parameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcaller_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 97\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstraints_str\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. Got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mparam_val\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[38;5;124m instead.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 98\u001b[0m )\n",
+ "\u001b[1;31mInvalidParameterError\u001b[0m: The 'stop_words' parameter of TfidfVectorizer must be a str among {'english'}, an instance of 'list' or None. Got {'o', 'den', 'an', 'şey', 'burada', 've', 'ah', 'ise', 'hiç', 'yine', 'biz', 'bu', 'da', 'genellikle', 'yılında', 'belirli', 'se', 'ne', 'kadar', 'neden', 'hem', 'aralar', 'yani', 'daha', 'araba', 'derken', 'dolayı', 'kısaca', 'karşı', 'niye', 'ki', 'bunu', 'buna', 'de', 'herhangi', 'önce', 'tabi', 'kime', 'biten', 'ben', 'ya', 'ya da', 'çünkü', 'mu', 'b', 'demek', 'fakat', 'şimdi', 'birlikte', 'her', 'bağlı', 'nasıl', 'şu', 'sadece', 'tüm', 'aslında', 'edilir', 'ama', 'bence', 'en', 'işte', 'gibi', 'ancak', 'birkaç', 'itibaren', 'mü', 'olabilir', 'bazı', 'oluşur', 'başlayan', 'yanı', 'olasılıkla', 'iyi', 'değil', 'eğer', 'yetenekli'} instead."
+ "]"
  }
  ],
  "source": [
 
  " def get_mongodb():\n",
  " return 'mongodb://localhost:27017/', 'EgitimDatabase', 'train'\n",
  "\n",
+ "#--------------------------------------------------------------------------\n",
+ "#combined_text should be added \n",
  " @staticmethod\n",
  " def get_input_documents(limit=3):\n",
  " mongo_url, db_name, collection_name = Database.get_mongodb()\n",
 
  " db = client[db_name]\n",
  " collection = db[collection_name]\n",
  " cursor = collection.find().limit(limit)\n",
+ " combined_text = [doc for doc in cursor]\n",
+ " document_count = len(combined_text)\n",
  " \n",
  " # Name the documents\n",
+ " named_documents = {f'döküman {i+1}': doc for i, doc in enumerate(combined_text)}\n",
  " \n",
  " return named_documents, document_count\n",
  "\n",
 
  " return Database.get_input_documents(limit)\n",
  "\n",
  "# Usage example\n",
+ "named_documents, document_count = Tf.get_input_documents(limit=1000)\n",
  "\n",
  "# selecting keywords from the documents with TF-IDF \n",
  "\n",
 
  " for word, score in sorted_words[:3]:\n",
  " print(\"\\tWord: {}, TF-IDF: {}\".format(word, round(score, 5)))\n",
  "\n",
+ "\n",
+ "# a threshold value needs to be chosen here\n",
  "turkish_stop_words = [\n",
  " 'ah', 'ama', 'an', 'ancak', 'araba', 'aralar', 'aslında', \n",
+ " 'b', 'başlayan', 'bağlı', 'bazı', 'belirli', 'ben', 'bence', \n",
+ " 'birkaç', 'birlikte', 'bunu', 'burada', 'biten', 'biz', \n",
+ " 'bu', 'buna', 'çünkü', 'da', 'de', 'demek', 'den', 'derken', \n",
+ " 'değil', 'daha', 'dolayı', 'edilir', 'eğer', 'en', 'fakat', \n",
+ " 'genellikle', 'gibi', 'hem', 'her', 'herhangi', 'hiç', 'ise', \n",
+ " 'işte', 'itibaren', 'iyi', 'kadar', 'karşı', 'ki', 'kime', \n",
+ " 'kısaca', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'niye', 'o', \n",
+ " 'olasılıkla', 'olabilir', 'oluşur', 'önce', 'şu', 'sadece', \n",
+ " 'se', 'şey', 'şimdi', 'tabi', 'tüm', 've', 'ya', 'ya da', \n",
+ " 'yanı', 'yani', 'yılında', 'yetenekli', 'yine'\n",
  "]\n",
  "\n",
+ "# the top keywords should be selected as features so that the text is organized around these words \n",
+ "def calculate_tfidf(combined_text, stop_words):\n",
+ " # scikit-learn only accepts stop_words as 'english', a list, or None,\n",
+ " # so convert here; passing a set raises the InvalidParameterError shown above\n",
  " vectorizer = TfidfVectorizer(stop_words=list(stop_words), max_features=10000)\n",
+ " tfidf_matrix = vectorizer.fit_transform(combined_text)\n",
  " feature_names = vectorizer.get_feature_names_out()\n",
  " return tfidf_matrix, feature_names\n",
  "\n",
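+ "# --- Hedged usage sketch (editor addition, not part of the original pipeline) ---\n",
+ "# The InvalidParameterError recorded earlier occurs because TfidfVectorizer accepts\n",
+ "# stop_words only as 'english', a list, or None, never a set, which is why\n",
+ "# calculate_tfidf converts with list(stop_words). toy_corpus is a made-up example.\n",
+ "toy_corpus = ['kedi evde uyuyor', 'köpek bahçede koşuyor', 'kedi ile köpek oyun oynuyor']\n",
+ "toy_matrix, toy_features = calculate_tfidf(toy_corpus, set(turkish_stop_words))\n",
+ "print('toy tf-idf shape:', toy_matrix.shape)\n",
+ "\n",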
+ "#---------------------------------------------------------------------------------\n",
  "# computing the average score of each word \n",
  "def identify_low_tfidf_words(tfidf_matrix, feature_names, threshold=0.001):\n",
  " # Sum the TF-IDF scores to compute each word's average score\n",
 
  " low_tfidf_words = [feature_names[i] for i, score in enumerate(avg_scores) if score < threshold]\n",
  " return low_tfidf_words\n",
  "\n",
+ "# updating the stop words according to the newly determined threshold \n",
  "def update_stop_words(existing_stop_words, low_tfidf_words):\n",
  " updated_stop_words = set(existing_stop_words) | set(low_tfidf_words)\n",
  " return list(updated_stop_words)\n",
  "\n",
  "\n",
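+ "# --- Hedged sketch: data-driven threshold (editor addition, assumptions flagged) ---\n",
+ "# The fixed threshold=0.001 above is arbitrary; one possible alternative, sketched\n",
+ "# here under a hypothetical name, is to flag the lowest-scoring fraction of the\n",
+ "# vocabulary. The 10th-percentile cut-off is an illustrative assumption, not a\n",
+ "# value taken from the notebook.\n",
+ "import numpy as np\n",
+ "\n",
+ "def identify_low_tfidf_words_percentile(tfidf_matrix, feature_names, percentile=10):\n",
+ " avg_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()\n",
+ " threshold = np.percentile(avg_scores, percentile)\n",
+ " return [feature_names[i] for i, score in enumerate(avg_scores) if score < threshold]\n",
+ "\n",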
+ "# this part should be elaborated \n",
+ "def iterative_update(combined_text, initial_stop_words, iterations=5):\n",
  " stop_words = set(initial_stop_words)\n",
  " for _ in range(iterations):\n",
+ " tfidf_matrix, feature_names = calculate_tfidf(combined_text, stop_words)\n",
  " low_tfidf_words = identify_low_tfidf_words(tfidf_matrix, feature_names)\n",
  " stop_words = update_stop_words(stop_words, low_tfidf_words)\n",
  " return list(stop_words)\n",
+ "\n",
  "\n",
  "\n",
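+ "# --- Hedged sketch: possible elaboration of iterative_update (editor addition) ---\n",
+ "# For the 'this part should be elaborated' note above: one option is to stop early\n",
+ "# once an iteration no longer discovers new low-TF-IDF words, instead of always\n",
+ "# running a fixed number of passes. This variant is a suggestion, not the author's code.\n",
+ "def iterative_update_until_stable(combined_text, initial_stop_words, max_iterations=5):\n",
+ " stop_words = set(initial_stop_words)\n",
+ " for _ in range(max_iterations):\n",
+ " tfidf_matrix, feature_names = calculate_tfidf(combined_text, stop_words)\n",
+ " low_tfidf_words = identify_low_tfidf_words(tfidf_matrix, feature_names)\n",
+ " new_words = set(low_tfidf_words) - stop_words\n",
+ " if not new_words: # converged: nothing new to filter out\n",
+ " break\n",
+ " stop_words |= new_words\n",
+ " return list(stop_words)\n",
+ "\n",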
  "def main ():\n",
  "\n",
+ " \n",
  "# a structure that also checks semantic relations will be built; by measuring similarity against the title, the words with the highest similarity will be ranked.\n",
  "\n",
  "# Get the documents as a list\n",
  " documents_list = [doc.get('text', '') if isinstance(doc, dict) else doc for doc in list(named_documents.values())]\n",
  "\n",
+ " # switching between the initial stop words and the newly derived ones \n",
+ " initial_stop_words = turkish_stop_words\n",
+ " # Update the stop-words list iteratively\n",
+ " final_stop_words = iterative_update(documents_list, initial_stop_words)\n",
  " # TF-IDF computation\n",
+ " tfidf_matrix, feature_names = calculate_tfidf(documents_list, final_stop_words)\n",
  "\n",
+ " \n",
 
  "\n",
+ " print(\"Güncellenmiş Stop-Words Listesi:\", final_stop_words)\n",
+ " print(\"TF-IDF Matrix Shape:\", tfidf_matrix.shape)\n",
+ " print(\"Feature Names Sample:\", feature_names[:10]) # shows the first 10 feature names\n",
  "\n",
+ " return tfidf_matrix, feature_names, documents_list\n",
 
  "\n",
+ "if __name__ == \"__main__\":\n",
+ " tfidf_matrix, feature_names, documents_list = main()\n",
  "\n",
  "\n",
  "# Print the results\n",
+ "print(\"İsimlendirilmiş Dökümanlar:\")\n",
+ "for name, doc in named_documents.items():\n",
+ " print(f\"{name}: {doc}\")\n",
  "\n",
  " print(\"\\nDökümanlar Listesi:\")\n",
  " print(documents_list)\n",
 
  },
  {
  "cell_type": "code",
+ "execution_count": 1,
  "metadata": {},
+ "outputs": [
+ {
+ "ename": "NameError",
+ "evalue": "name 'TfidfVectorizer' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[1], line 41\u001b[0m\n\u001b[0;32m 31\u001b[0m turkish_stop_words \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m([\n\u001b[0;32m 32\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124ma\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabide\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabi\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabla\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mad\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124madım\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mah\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mama\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124man\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mancak\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maraba\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maralar\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maslında\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[0;32m 33\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maşşağı\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maz\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbazı\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbelirli\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mben\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbence\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbunu\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mburada\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbiz\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbu\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuna\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mçünkü\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mönce\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mşu\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124msadece\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124msana\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mse\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mşey\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mşimdi\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtabi\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtüm\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mve\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mya\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mya da\u001b[39m\u001b[38;5;124m'\u001b[39m, 
\u001b[38;5;124m'\u001b[39m\u001b[38;5;124myani\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124myine\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m 38\u001b[0m ])\n\u001b[0;32m 40\u001b[0m \u001b[38;5;66;03m# TF-IDF hesaplayıcı oluşturun ve Türkçe durak kelimelerini dahil edin\u001b[39;00m\n\u001b[1;32m---> 41\u001b[0m vectorizer \u001b[38;5;241m=\u001b[39m \u001b[43mTfidfVectorizer\u001b[49m(stop_words\u001b[38;5;241m=\u001b[39mturkish_stop_words)\n\u001b[0;32m 44\u001b[0m \u001b[38;5;124;03m\"\"\"IDF, derlemedeki belge sayısının,\u001b[39;00m\n\u001b[0;32m 45\u001b[0m \u001b[38;5;124;03mincelenen anahtar kelimeyi içeren topluluktaki belge sayısına \u001b[39;00m\n\u001b[0;32m 46\u001b[0m \u001b[38;5;124;03mbölünmesiyle elde edilen algoritmadır. \u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 49\u001b[0m \u001b[38;5;124;03mkülliyat yani incelenen tüm belgelerin adedi 10 ise ve test edilen anahtar kelime,\u001b[39;00m\n\u001b[0;32m 50\u001b[0m \u001b[38;5;124;03mkülliyattaki üç belgede görünüyorsa, bu durumda IDF değeri 0.52’dir (log (10/3)).\"\"\"\u001b[39;00m\n\u001b[0;32m 51\u001b[0m \u001b[38;5;66;03m#TF-IDF puanı; Naive Bayes ve Destek Vektör Makineleri gibi algoritmalara aktarılabilir. Böylece kelime sayısı gibi daha temel yöntemlerin sonuçları büyük ölçüde iyileştirilebilir.\u001b[39;00m\n\u001b[0;32m 52\u001b[0m \u001b[38;5;66;03m#IDF = log ( Dokuman Sayısı / Terimin Geçtiği Dokuman Sayısı )\u001b[39;00m\n\u001b[0;32m 53\u001b[0m \u001b[38;5;66;03m#dokuman sayısılarını almakla başlayacağız.\u001b[39;00m\n\u001b[0;32m 54\u001b[0m \u001b[38;5;66;03m# : titlelerın sayısı / terimler ise \u001b[39;00m\n",
+ "\u001b[1;31mNameError\u001b[0m: name 'TfidfVectorizer' is not defined"
+ "]"
+ "}"
+ "],"
  "source": [
+ "# the NameError recorded above indicates this cell is missing its own import\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
  "\n",
  "#---------------------------------------------------------------------------------------------------------------------------------\n",