Spaces:

yonkasoft
/

makaleChatbotu

Build error

App Files Files Community

yonkasoft committed 7 days ago

Commit

3a1020a

•

1 Parent(s): d1fb0c2

Upload 13 files

Browse files

Files changed (14) hide show

.gitattributes +5 -0
400.csv +3 -0
cleaned.csv +3 -0
dataframe.ipynb +349 -0
fiil(zemberek).py +20 -0
fiil.py +50 -0
köklendirme.py +1 -0
stop_words.ipynb +0 -0
stop_words.py +77 -0
stopwords.csv +3 -0
stopwords.txt +313 -0
temizleme.ipynb +267 -0
veriler_first_400k.csv +3 -0
veriler_tokenized.csv +3 -0

.gitattributes CHANGED Viewed

@@ -47,3 +47,8 @@ onBin.bson filter=lfs diff=lfs merge=lfs -text
 test.bson filter=lfs diff=lfs merge=lfs -text
 train.bson filter=lfs diff=lfs merge=lfs -text
 EgitimDatabase.test.csv filter=lfs diff=lfs merge=lfs -text

 test.bson filter=lfs diff=lfs merge=lfs -text
 train.bson filter=lfs diff=lfs merge=lfs -text
 EgitimDatabase.test.csv filter=lfs diff=lfs merge=lfs -text
+400.csv filter=lfs diff=lfs merge=lfs -text
+cleaned.csv filter=lfs diff=lfs merge=lfs -text
+stopwords.csv filter=lfs diff=lfs merge=lfs -text
+veriler_first_400k.csv filter=lfs diff=lfs merge=lfs -text
+veriler_tokenized.csv filter=lfs diff=lfs merge=lfs -text

400.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f5689ace1f5518d233d1fef1926e5bc3b002e34a5e02965c6db2e302e9d859b8
+size 1996328103

cleaned.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0b88070cb4c5d0246395e2246d3356c76733fa2f2a87f05157bc9322f843bee2
+size 1414512946

dataframe.ipynb ADDED Viewed

	@@ -0,0 +1,349 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                                    _id       id  \\\n",
+      "0  {'$oid': '66c33a8c3b8bd216bd8ea93a'}  3525037   \n",
+      "1  {'$oid': '66c33a8c3b8bd216bd8ea93b'}  3532700   \n",
+      "2  {'$oid': '66c33a8c3b8bd216bd8ea93c'}  3203545   \n",
+      "3  {'$oid': '66c33a8c3b8bd216bd8ea93d'}  1765445   \n",
+      "4  {'$oid': '66c33a8c3b8bd216bd8ea93e'}   575462   \n",
+      "\n",
+      "                                                 url            title  \\\n",
+      "0  https://tr.wikipedia.org/wiki/P%C5%9F%C4%B1qo%...    Pşıqo Ahecaqo   \n",
+      "1      https://tr.wikipedia.org/wiki/Craterolophinae  Craterolophinae   \n",
+      "2           https://tr.wikipedia.org/wiki/Notocrabro       Notocrabro   \n",
+      "3    https://tr.wikipedia.org/wiki/Ibrahim%20Sissoko  Ibrahim Sissoko   \n",
+      "4        https://tr.wikipedia.org/wiki/Salah%20Cedid      Salah Cedid   \n",
+      "\n",
+      "                                                text  no  \n",
+      "0  Pşıqo Ahecaqo (), Çerkes siyasetçi, askeri kom...   0  \n",
+      "1  Craterolophinae, Depastridae familyasına bağlı...   1  \n",
+      "2  Notocrabro Crabronina oymağına bağlı bir cinst...   2  \n",
+      "3  İbrahim Sissoko (d. 30 Kasım 1991), Fildişi Sa...   3  \n",
+      "4  Salah Cedid (1926-1993) (Arapça: صلاح جديد) Su...   4  \n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# CSV dosyasını yükleyelim\n",
+    "df = pd.read_csv('veriler.csv')\n",
+    "\n",
+    "# ID sütunu ekleyelim (her satıra 0'dan başlayarak benzersiz bir ID verelim)\n",
+    "df['no'] = df.index\n",
+    "\n",
+    "# Sonucu yeni bir CSV dosyasına kaydedelim\n",
+    "df.to_csv('data_with_id.csv', index=False)\n",
+    "\n",
+    "# İlk birkaç satırı kontrol edelim\n",
+    "print(df.head())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Boş değer sayısı:\n",
+      "_id      0\n",
+      "id       0\n",
+      "url      0\n",
+      "title    0\n",
+      "text     0\n",
+      "dtype: int64\n",
+      "Tekrarlanan değer sayısı:\n",
+      "0\n",
+      "Eşleşmeyen 'title' sayısı: 0\n",
+      "Eşleşmeyen 'text' sayısı: 0\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Verileri yükleyin\n",
+    "df = pd.read_csv(\"common_400000.csv\")\n",
+    "\n",
+    "# Boş değerlerin kontrolü\n",
+    "print(\"Boş değer sayısı:\")\n",
+    "print(df.isnull().sum())\n",
+    "\n",
+    "# Tekrarlanan değerlerin kontrolü\n",
+    "print(\"Tekrarlanan değer sayısı:\")\n",
+    "print(df.duplicated(subset=['title', 'text']).sum())\n",
+    "\n",
+    "# Eşleşmeyen değerlerin kontrolü\n",
+    "unmatched_titles = df[df['text'].isna()]\n",
+    "print(f\"Eşleşmeyen 'title' sayısı: {len(unmatched_titles)}\")\n",
+    "\n",
+    "unmatched_texts = df[df['title'].isna()]\n",
+    "print(f\"Eşleşmeyen 'text' sayısı: {len(unmatched_texts)}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Eksik 'text' değerlerini doldur\n",
+    "df['text'] = df['text'].fillna(\"Missing Text\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tamamen aynı olan satırları kaldır\n",
+    "df = df.drop_duplicates(subset=['title', 'text'])\n",
+    "\n",
+    "# Sadece 'title' bazında tekrarlanan satırları kaldır (ilkini tutar)\n",
+    "df = df.drop_duplicates(subset=['title'], keep='first')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Text'i olmayan satırları kaldır\n",
+    "df = df.dropna(subset=['text'])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Veriler başarıyla converted_file.csv olarak kaydedildi.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# JSON dosyasını yükleyin\n",
+    "json_file = 'EgitimDatabase.train.json'  # JSON dosyanızın adı\n",
+    "df = pd.read_json(json_file)  # JSON dosyasını DataFrame'e dönüştürme\n",
+    "\n",
+    "# DataFrame'i CSV olarak kaydetme\n",
+    "csv_file = 'converted_file.csv'  # Çıktı CSV dosya adı\n",
+    "df.to_csv(csv_file, index=False, encoding='utf-8')  # index olmadan ve UTF-8 formatında kaydedilir\n",
+    "\n",
+    "print(f\"Veriler başarıyla {csv_file} olarak kaydedildi.\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 416434 entries, 0 to 416433\n",
+      "Data columns (total 5 columns):\n",
+      " #   Column  Non-Null Count   Dtype \n",
+      "---  ------  --------------   ----- \n",
+      " 0   _id     416434 non-null  object\n",
+      " 1   id      416434 non-null  int64 \n",
+      " 2   url     416434 non-null  object\n",
+      " 3   title   416434 non-null  object\n",
+      " 4   text    416434 non-null  object\n",
+      "dtypes: int64(1), object(4)\n",
+      "memory usage: 15.9+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Boş text satırları: 1\n",
+      "Boş title satırları: 0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Boş değerleri kontrol etmek\n",
+    "empty_text_rows = df[df['text'].str.strip() == \"\"]\n",
+    "print(f\"Boş text satırları: {len(empty_text_rows)}\")\n",
+    "\n",
+    "empty_title_rows = df[df['title'].str.strip() == \"\"]\n",
+    "print(f\"Boş title satırları: {len(empty_title_rows)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Geçerli satır sayısı: 416434\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Hem title hem text dolu olanları kontrol et\n",
+    "valid_rows = df[df['title'].notnull() & df['text'].notnull()]\n",
+    "print(f\"Geçerli satır sayısı: {len(valid_rows)}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['text'] = df['text'].apply(lambda x: str(x) if isinstance(x, dict) else x)\n",
+    "df['_id'] = df['_id'].apply(lambda x: str(x) if isinstance(x, dict) else x)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Tekrarlayan satır sayısı: 0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Tekrarlayan satırları kontrol etmek\n",
+    "duplicated_rows = df[df.duplicated()]\n",
+    "print(f\"Tekrarlayan satır sayısı: {len(duplicated_rows)}\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Empty DataFrame\n",
+      "Columns: [_id, id, url, title, text]\n",
+      "Index: []\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Title ve text sütunlarında boş veya tutarsız değer var mı?\n",
+    "print(df[df['title'].isna() | df['text'].isna()])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['text'] = df['text'].fillna('Eksik veri')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "416434\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Sütundaki benzersiz değerleri sayma\n",
+    "print(df['title'].nunique())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "414397\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(df['text'].nunique())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

fiil(zemberek).py ADDED Viewed

	@@ -0,0 +1,20 @@

+from zemberek import TurkishMorphology
+# Demo script: analyze a single Turkish word with Zemberek and print each
+# analysis result that comes back.
+# Create the TurkishMorphology object with default resources
+morphology = TurkishMorphology.create_with_defaults()
+# Word to analyze
+word = "katıldılar"
+print(f"Word = {word}")
+# Analyze the word
+results = morphology.analyze(word)
+# A word can yield several results; print the details of each one in turn
+for result in results:
+    # Print the result in a detailed string format
+    print(f"Lexical and Surface: {result.format_string()}")
+    print(f"Stems: {result.get_stem()}")
+    # Lemma output is switched off.
+    # NOTE(review): confirm the lemma accessor name before re-enabling.
+    #print(f"Lemmas: {result.get_lemma()}")
+    # Blank line between consecutive results
+    print()

fiil.py ADDED Viewed

	@@ -0,0 +1,50 @@

+from zemberek import TurkishMorphology
+from typing import List
+from functools import lru_cache
+morphology = TurkishMorphology.create()
+# Initialize the Turkish morphology analyzer as a global constant
+MORPHOLOGY = TurkishMorphology.create_with_default_resources()
+@lru_cache(maxsize=1000)  # Cache results for better performance
+def is_verb_or_verbform_zemberek(word: str) -> bool:
+    """
+    Check if a word is a verb or verbform using Zemberek analysis.
+    Args:
+        word (str): The word to analyze
+    Returns:
+        bool: True if the word is a verb or verbform, False otherwise
+    """
+    try:
+        # Get the analysis result for the word
+        result = MORPHOLOGY.analyze(word)
+        # Return True if there's at least one analysis and it's a verb
+        return bool(result and "Verb" in result[0].primary_pos.value)
+    except Exception as e:
+        print(f"Error analyzing word '{word}': {str(e)}")
+        return False
+def filter_verbs(words: List[str]) -> List[str]:
+    """
+    Filter a list of words to keep only verbs and verbforms.
+    Args:
+        words (List[str]): List of words to filter
+    Returns:
+        List[str]: List containing only verbs and verbforms
+    """
+    return [word for word in words if is_verb_or_verbform_zemberek(word)]
+def main():
+    # Example usage
+    words = ["geliyor", "gitti", "yapmak", "kitap"]
+    filtered_words = filter_verbs(words)
+    print(f"Verbs found: {filtered_words}")
+if __name__ == "__main__":
+    main()

köklendirme.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ #nltk kütüphanesiyle

stop_words.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

stop_words.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import pandas as pd
+def load_stopwords(file_path):
+    with open(file_path, 'r', encoding='utf-8') as f:
+        stopwords = f.read().splitlines()  # Her satır bir stopword olacak şekilde yükle
+    return set(stopwords)
+# Stopword set that remove_stopwords_without_nltk checks each word against
+stop_words = load_stopwords('stopwords.txt')
+# Input table; its 'cleaned_text' column is read further down
+df = pd.read_csv('veriler_cleaned.csv')
+def remove_stopwords_without_nltk(text):
+    if isinstance(text, str):
+        words = text.split()
+        filtered_words = [word for word in words if word.lower() not in stop_words]
+        return ' '.join(filtered_words)
+    else:
+        return ""
+# New column: 'cleaned_text' with the stopwords filtered out
+df['stopwords_text'] = df['cleaned_text'].apply(remove_stopwords_without_nltk)
+# Print the first rows of both columns side by side as a quick check
+print(df[['cleaned_text', 'stopwords_text']].head())
+# Write the whole table, new column included, without the index column
+df.to_csv('temizlenmis_veri.csv', index=False)
+"""
+import pandas as pd
+import nltk
+from nltk.tokenize import word_tokenize
+nltk.download('stopwords')
+nltk.download('punkt')
+from nltk.corpus import stopwords
+stop_words = set(stopwords.words('turkish'))
+def load_custom_stopwords(file_path):
+    with open(file_path, 'r', encoding='utf-8') as f:
+        custom_stopwords = f.read().splitlines()
+    return set(custom_stopwords)
+custom_stopwords = load_custom_stopwords('stopwords.txt')
+stop_words.update(custom_stopwords)
+df = pd.read_csv('veriler_cleaned.csv')
+def remove_stopwords(text):
+    if isinstance(text, str):
+        words = word_tokenize(text)
+        filtered_words = [word for word in words if word.lower() not in stop_words]  # Stopwords'leri çıkar
+        return ' '.join(filtered_words)
+    else:
+        return ""
+df['stopwords_text'] = df['cleaned_text'].apply(remove_stopwords)
+print(df[['cleaned_text', 'stopwords_text']].head())
+df.to_csv('temizlenmis_veri.csv', index=False)
+"""

stopwords.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d9869903a48be79996282f11e06c3108160295b4f280cfd827fe1c18b6ea9fb4
+size 3212535134

stopwords.txt ADDED Viewed

	@@ -0,0 +1,313 @@

+Dış
+Bağlantılar
+eş
+asılarak
+anlamlıdır
+d
+nasıl
+ne
+en
+ve
+ile
+katılmıştır
+kullanıldı
+kullandı
+çıktı
+bulundu
+başlayan
+adlardaki
+vardı
+ekiyle
+eki
+oğlu
+olduğunu
+olduğunuz
+olduğunun
+tarihlerini
+öldü
+ölenler
+ölen
+Kaynakça
+vardı
+yaparlar
+yapardı
+yapmak
+yapabilmek
+yapılabilir
+Dış
+Bağlantılar
+kişiler
+kişilerin
+yılında
+yıllarında
+yılları
+boyunca
+süresince
+bağlı
+sonucu
+bitişi
+bitmiştir
+bittabi
+bitti
+yıl
+resmi
+asılarak
+bulundu
+olmuştur
+oldu
+süregelen
+süresince
+sürer
+nedeniyle
+nedeni
+neden
+nedeni
+açıklar
+bulunan
+önemli
+dayanmaktadır
+söylenmiştir
+söylenmektedir
+olması
+olmaması
+öne
+çıkar
+dikkat
+çeker
+dikkat
+çeken
+insanlar
+başladı
+başlayan
+yaşayanlar
+yaşayan
+varlığını
+sürdüren
+tamamlayıp
+eş anlamlı
+soy ismidir
+büyütüldü
+kullanıldı
+yapıldı
+lakapları
+lakaplıdır
+ismidir
+biten
+doğdu
+yaşadı
+içinde
+anlayışı
+aranan
+doğumlular
+doğumlu
+eski
+diyorlardı
+yer
+alan
+uygun
+yerlerde
+yerler
+yerleri
+yerlerin
+ilgili
+ili
+sanılmaktadır
+başlayan
+duyulan
+Bağlantılar
+bağlı
+Dış
+Dış
+Bağlantılar
+gidildi
+işgal edildi
+ünlüydü
+olduğunu
+olduğu
+olduğunuz
+olduğum
+onların
+onlar
+onlardan
+onlara
+doğumlular
+yaşayanlar
+ölenler
+türüdür
+varsaymak
+varsayılır
+adlandırılmaktadır
+kaynakça
+Kaynakça
+Biyografi
+katılmıştır
+getirmiştir
+girmiştir
+gitmişti
+gidilen
+gelen
+gelinen
+getirdim
+getirip
+getirdim
+geldi
+geldik
+geldikleri
+geldiklerinde
+geldiklerini
+geldikten
+geldiler
+geldim
+geldin
+geldiniz
+geldiği
+geldiğince
+geldiğini
+geldiğince
+geldiğinde
+geldiğini
+gelirdi
+geliri
+geldiniz
+gelmeden
+genelde
+gelince
+geleceğin
+biriydi
+biricik
+biridir
+birileri
+birilerinin
+birlikleri
+biriyle
+birinin
+biri
+sizi
+bizi
+biyografi
+beni
+lakap
+lakabıdır
+ama
+ancak
+artık
+aslında
+ayrıca
+bazı
+bütün
+çünkü
+daha
+de
+da
+gibi
+hem
+her
+hiç
+ile
+ise
+kez
+ki
+muhtemelen
+nasıl
+ne
+neden
+o
+onu
+onun
+onlar
+oysa
+pek
+peki
+siz
+u
+şunu
+sizin
+tam
+tüm
+veya
+ya
+yani
+yine
+yoksa
+çok
+az
+fazla
+başka
+belki
+kendi
+kendine
+kendini
+kendisi
+kadar
+önce
+sonra
+tekrar
+hep
+henüz
+hiç
+hemen
+yalnızca
+zaten
+şimdi
+tabii
+tabi
+zaten
+üzere
+gitmişti
+gidiyor
+gitti
+gidip
+gidince
+gidemedim
+gelen
+gelmişti
+geliyor
+gelince
+geldim
+geldin
+geldiler
+geldikten
+geldiğini
+gelir
+getiriyor
+getirdim
+getirdik
+getirdiler
+getiremedik
+getirtip
+getirtti
+getiren
+getirilen
+getirebilirim
+getirip
+getirt
+İsim
+ismidir
+ismi
+Bu
+şu
+Şu
+şuna
+şundan
+şunları
+şunu
+şöyle
+böyle
+ek
+bir
+biri
+biridir
+birileri
+birilerinin
+birşeyi
+birlikleri
+birkez
+birinin
+bitmiştir
+biten
+bitti
+bittabi
+birçok
+soyadları
+eş anlamlı
+eş anlamlıdır

temizleme.ipynb ADDED Viewed

	@@ -0,0 +1,267 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "400.000 satır üzerinde temizleme"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                        _id       id  \\\n",
+      "0  66c33a8c3b8bd216bd8ea93a  3525037   \n",
+      "1  66c33a8c3b8bd216bd8ea93b  3532700   \n",
+      "2  66c33a8c3b8bd216bd8ea93c  3203545   \n",
+      "3  66c33a8c3b8bd216bd8ea93d  1765445   \n",
+      "4  66c33a8c3b8bd216bd8ea93e   575462   \n",
+      "\n",
+      "                                                 url            title  \\\n",
+      "0  https://tr.wikipedia.org/wiki/P%C5%9F%C4%B1qo%...    Pşıqo Ahecaqo   \n",
+      "1      https://tr.wikipedia.org/wiki/Craterolophinae  Craterolophinae   \n",
+      "2           https://tr.wikipedia.org/wiki/Notocrabro       Notocrabro   \n",
+      "3    https://tr.wikipedia.org/wiki/Ibrahim%20Sissoko  Ibrahim Sissoko   \n",
+      "4        https://tr.wikipedia.org/wiki/Salah%20Cedid      Salah Cedid   \n",
+      "\n",
+      "                                                text  \n",
+      "0  Pşıqo Ahecaqo (), Çerkes siyasetçi, askeri kom...  \n",
+      "1  Craterolophinae, Depastridae familyasına bağlı...  \n",
+      "2  Notocrabro Crabronina oymağına bağlı bir cinst...  \n",
+      "3  İbrahim Sissoko (d. 30 Kasım 1991), Fildişi Sa...  \n",
+      "4  Salah Cedid (1926-1993) (Arapça: صلاح جديد) Su...  \n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# CSV dosyasını yükleme\n",
+    "df = pd.read_csv(\"EgitimDatabase.train.csv\")\n",
+    "\n",
+    "# Veriyi kontrol etme\n",
+    "print(df.head())  # İlk 5 satırı görmek için\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                        _id       id  \\\n",
+      "0  66c33a8c3b8bd216bd8ea93a  3525037   \n",
+      "1  66c33a8c3b8bd216bd8ea93b  3532700   \n",
+      "2  66c33a8c3b8bd216bd8ea93c  3203545   \n",
+      "3  66c33a8c3b8bd216bd8ea93d  1765445   \n",
+      "4  66c33a8c3b8bd216bd8ea93e   575462   \n",
+      "\n",
+      "                                                 url            title  \\\n",
+      "0  https://tr.wikipedia.org/wiki/P%C5%9F%C4%B1qo%...    Pşıqo Ahecaqo   \n",
+      "1      https://tr.wikipedia.org/wiki/Craterolophinae  Craterolophinae   \n",
+      "2           https://tr.wikipedia.org/wiki/Notocrabro       Notocrabro   \n",
+      "3    https://tr.wikipedia.org/wiki/Ibrahim%20Sissoko  Ibrahim Sissoko   \n",
+      "4        https://tr.wikipedia.org/wiki/Salah%20Cedid      Salah Cedid   \n",
+      "\n",
+      "                                                text  \n",
+      "0  Pşıqo Ahecaqo (), Çerkes siyasetçi, askeri kom...  \n",
+      "1  Craterolophinae, Depastridae familyasına bağlı...  \n",
+      "2  Notocrabro Crabronina oymağına bağlı bir cinst...  \n",
+      "3  İbrahim Sissoko (d. 30 Kasım 1991), Fildişi Sa...  \n",
+      "4  Salah Cedid (1926-1993) (Arapça: صلاح جديد) Su...  \n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# CSV dosyasından sadece ilk 400.000 satırı çekmek\n",
+    "df = pd.read_csv(\"EgitimDatabase.train.csv\", nrows=400000)\n",
+    "\n",
+    "# Veriyi kontrol etme\n",
+    "print(df.head())  # İlk 5 satırı görmek için\n",
+    "\n",
+    "# Yeni CSV dosyasına kaydetme\n",
+    "df.to_csv(\"veriler_first_400k.csv\", index=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Veri temizleme işlemi tamamlandı. Güncellenmiş dosya: 'veriler_updated.csv'\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import re\n",
+    "#----------------------------------------------------------------------\n",
+    "input_file = \"veriler_first_400k.csv\"\n",
+    "\n",
+    "#-------------------------------------------------------------------------\n",
+    "def extract_removed_text(original_text, cleaned_text):\n",
+    "    \"\"\"Orijinal ve temizlenmiş metinler arasındaki farkı çıkarır.\"\"\"\n",
+    "    original_words = set(original_text.split())\n",
+    "    cleaned_words = set(cleaned_text.split())\n",
+    "    removed_words = original_words - cleaned_words\n",
+    "    return ' '.join(removed_words)\n",
+    "\n",
+    "#-------------------------------------------------------------------\n",
+    "def metni_temizle(metin):\n",
+    "    \"\"\"Yılları koruyarak metni temizler.\"\"\"\n",
+    "    temiz_metin = re.sub(r'\\b(?!\\d{4}\\b)\\d+', '', metin)  \n",
+    "    temiz_metin = re.sub(r'[^\\w\\s]', '', temiz_metin)   \n",
+    "    temiz_metin = re.sub(r'\\s+', ' ', temiz_metin)       \n",
+    "    return temiz_metin.strip()\n",
+    "\n",
+    "\n",
+    "chunk_size = 1000\n",
+    "error_rows = []\n",
+    "\n",
+    "chunks = pd.read_csv(input_file, chunksize=chunk_size, on_bad_lines='skip', delimiter=',', quotechar='\"', encoding='utf-8')\n",
+    "\n",
+    "#---------------> İşlenmiş tüm chunk'ları tutmak için liste\n",
+    "processed_chunks = []\n",
+    "\n",
+    "for chunk in chunks:\n",
+    "    # -------------->Hatalı verileri ayıkla\n",
+    "    error_chunk = chunk[chunk['text'].isna()]\n",
+    "    if not error_chunk.empty:\n",
+    "        error_rows.append(error_chunk)\n",
+    "    \n",
+    "    # ------------->Boş olmayan verileri temizle\n",
+    "    cleaned_chunk = chunk.dropna(subset=['text']).copy()\n",
+    "    \n",
+    "    \n",
+    "    cleaned_chunk['cleaned_text'] = cleaned_chunk['text'].apply(metni_temizle)\n",
+    "    cleaned_chunk['removed_text'] = cleaned_chunk.apply(\n",
+    "        lambda row: extract_removed_text(row['text'], row['cleaned_text']), axis=1\n",
+    "    )\n",
+    "    \n",
+    "    # ---------->İşlenmiş chunk'ları listeye ekle\n",
+    "    processed_chunks.append(cleaned_chunk)\n",
+    "\n",
+    "\n",
+    "final_df = pd.concat(processed_chunks, ignore_index=True)\n",
+    "\n",
+    "# -------------->Hatalı satırları ayır ve CSV olarak kaydet\n",
+    "if error_rows:\n",
+    "    error_df = pd.concat(error_rows, ignore_index=True)\n",
+    "    error_df.to_csv(\"error_rows.csv\", index=False)\n",
+    "\n",
+    "# Yeni sütunları ekleyerek ana CSV'yi güncelle\n",
+    "final_df.to_csv(\"veriler_updated.csv\", index=False)\n",
+    "\n",
+    "print(\"Veri temizleme işlemi tamamlandı. Güncellenmiş dosya: 'veriler_updated.csv'\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                        _id       id  \\\n",
+      "0  66c33a8c3b8bd216bd8ea93a  3525037   \n",
+      "1  66c33a8c3b8bd216bd8ea93b  3532700   \n",
+      "2  66c33a8c3b8bd216bd8ea93c  3203545   \n",
+      "3  66c33a8c3b8bd216bd8ea93d  1765445   \n",
+      "4  66c33a8c3b8bd216bd8ea93e   575462   \n",
+      "\n",
+      "                                                 url            title  \\\n",
+      "0  https://tr.wikipedia.org/wiki/P%C5%9F%C4%B1qo%...    Pşıqo Ahecaqo   \n",
+      "1      https://tr.wikipedia.org/wiki/Craterolophinae  Craterolophinae   \n",
+      "2           https://tr.wikipedia.org/wiki/Notocrabro       Notocrabro   \n",
+      "3    https://tr.wikipedia.org/wiki/Ibrahim%20Sissoko  Ibrahim Sissoko   \n",
+      "4        https://tr.wikipedia.org/wiki/Salah%20Cedid      Salah Cedid   \n",
+      "\n",
+      "                                                text  \\\n",
+      "0  Pşıqo Ahecaqo (), Çerkes siyasetçi, askeri kom...   \n",
+      "1  Craterolophinae, Depastridae familyasına bağlı...   \n",
+      "2  Notocrabro Crabronina oymağına bağlı bir cinst...   \n",
+      "3  İbrahim Sissoko (d. 30 Kasım 1991), Fildişi Sa...   \n",
+      "4  Salah Cedid (1926-1993) (Arapça: صلاح جديد) Su...   \n",
+      "\n",
+      "                                        cleaned_text  \\\n",
+      "0  Pşıqo Ahecaqo Çerkes siyasetçi askeri komutan ...   \n",
+      "1  Craterolophinae Depastridae familyasına bağlı ...   \n",
+      "2  Notocrabro Crabronina oymağına bağlı bir cinst...   \n",
+      "3  İbrahim Sissoko d Kasım 1991 Fildişi Sahilili ...   \n",
+      "4  Salah Cedid 19261993 Arapça صلاح جديد Suriyeli...   \n",
+      "\n",
+      "                                        removed_text  \n",
+      "0  vardı. \"Beyoğlu\" 12.000 Ahecaqo'nun soylular\\n...  \n",
+      "1  \\n\\nDepastridae - bağlantılar\\n\\nKaynakça Clar...  \n",
+      "2  \\n\\nCrabronina bağlantılar\\n\\nKaynakça cinstir...  \n",
+      "3  €1.5 Jean-Alain kazandı. yaptı.\\n\\nEskişehirsp...  \n",
+      "4  tutuklatmıştır. 1966’dan sonrasında, girmiştir...  \n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# CSV dosyasını yükleme\n",
+    "df = pd.read_csv(\"veriler_updated.csv\")\n",
+    "\n",
+    "# Tekrar eden verileri tespit etme\n",
+    "duplicated_values = df[\"cleaned_text\"][df[\"cleaned_text\"].duplicated(keep=False)]\n",
+    "\n",
+    "# Bu değerleri dataset'ten tamamen çıkarma\n",
+    "df_cleaned = df[~df[\"cleaned_text\"].isin(duplicated_values)]\n",
+    "\n",
+    "# Temizlenmiş veriyi kontrol etme (ilk 5 satır)\n",
+    "print(df_cleaned.head())\n",
+    "\n",
+    "# Temizlenmiş DataFrame'i yeni bir CSV dosyasına kaydetme\n",
+    "df_cleaned.to_csv(\"veriler_cleaned.csv\", index=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

veriler_first_400k.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3380f3c34d811c052e5e89f9e1a851c2b76f3137f4984dad17f529cbc91056fb
+size 756915538

veriler_tokenized.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:59e7ddd8570948af90754fcd5fd57adae4408a36d96b087f62f779f216f888bd
+size 2312831045