# arabic2english: src/data_processing/data_processing.py
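"""Loads the English-Arabic parallel corpus, tokenizes both languages with
spaCy, and builds torchtext datasets, splits, and vocabularies."""
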
import re
import spacy
import random
import pandas as pd
from torchtext import data  # legacy Field/Dataset API, requires torchtext <= 0.8
from spacy.lang.ar import Arabic
from spacy.tokenizer import Tokenizer

# Reading the corpus into a DataFrame: each line of arabic2english.txt holds an
# English sentence, a tab, then its Arabic translation
df = pd.read_csv(
"data/arabic2english.txt",
delimiter="\t",
names=["eng", "ar"],
)
# Loading English language model from spaCy
spacy_eng = spacy.load("en_core_web_sm")  # requires: python -m spacy download en_core_web_sm
# Creating an instance of Arabic language model from spaCy
arab = Arabic()
# Creating a tokenizer for Arabic text using the Arabic language model
ar_Tokenizer = Tokenizer(arab.vocab)
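# Note: a Tokenizer built from a bare vocab has no prefix/suffix/infix rules,
# so it effectively splits on whitespace; arTokenizer below strips some
# punctuation first to compensate.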

def engTokenizer(text):
    """
    Tokenizes English text using the spaCy English tokenizer.

    Args:
        text (str): The input English text.

    Returns:
        list: List of tokens.
    """
    return [word.text for word in spacy_eng.tokenizer(text)]

def arTokenizer(sentence):
    """
    Normalizes punctuation and whitespace, then tokenizes an Arabic sentence
    with the spaCy Arabic tokenizer.

    Args:
        sentence (str): The input Arabic sentence.

    Returns:
        list: List of tokens.
    """
    # Drop periods, quotes, literal plus signs, and newlines, collapse runs of
    # whitespace, then tokenize.
    return [
        word.text
        for word in ar_Tokenizer(
            re.sub(r"\s+", " ", re.sub(r"[\.\'\"\n+]", " ", sentence)).strip()
        )
    ]
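# Illustrative examples (exact output depends on the installed spaCy models):
#   engTokenizer("How are you?") -> ["How", "are", "you", "?"]
#   arTokenizer("كيف حالك؟")     -> ["كيف", "حالك؟"]
# Note the Arabic question mark stays attached: the bare Arabic tokenizer only
# splits on whitespace, and "؟" is not in the punctuation-stripping regex.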

# Defining torchtext fields for the source (English) and target (Arabic) languages
SRC = data.Field(
tokenize=engTokenizer, batch_first=False, init_token="<sos>", eos_token="<eos>"
)
TRG = data.Field(
    tokenize=arTokenizer,
    batch_first=False,
    # tokenizer_language is only consulted when tokenize="spacy", so it is
    # omitted here; arTokenizer is passed in directly instead.
    init_token="بداية",  # Arabic for "beginning", the start-of-sequence token
    eos_token="نهاية",  # Arabic for "end", the end-of-sequence token
)
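# Both fields prepend/append their init/eos tokens during batching and, once
# build_vocab() runs below, map tokens to integer indices.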

class TextDataset(data.Dataset):
    """
    Custom dataset class for parallel text data.

    Args:
        df (pandas.DataFrame): DataFrame containing source and target language data.
        src_field (torchtext.data.Field): Field for the source language.
        target_field (torchtext.data.Field): Field for the target language.
        is_test (bool): Flag indicating if the dataset is for testing (currently unused).

    Attributes:
        fields (list): List of (field name, Field) tuples.
        examples (list): List of data examples, populated by the parent class.
    """

    def __init__(self, df, src_field, target_field, is_test=False, **kwargs):
        fields = [("eng", src_field), ("ar", target_field)]
        samples = []
        for _, row in df.iterrows():
            samples.append(data.Example.fromlist([row.eng, row.ar], fields))
        # The parent class stores the examples as self.examples.
        super().__init__(samples, fields, **kwargs)

    def __len__(self):
        """
        Get the number of samples in the dataset.

        Returns:
            int: Number of samples.
        """
        return len(self.examples)

    def __getitem__(self, idx):
        """
        Get a sample from the dataset.

        Args:
            idx (int): Index of the sample.

        Returns:
            torchtext.data.Example: Sample at the specified index.
        """
        return self.examples[idx]

# Creating a TextDataset instance
torchdataset = TextDataset(df, SRC, TRG)
# Splitting the dataset into training and validation sets. split() expects
# random_state to be a random.getstate() tuple; random.seed() returns None,
# so seeding and capturing the state are two separate steps.
random.seed(32)
train_data, valid_data = torchdataset.split(
    split_ratio=0.8, random_state=random.getstate()
)

# Building vocabularies for source and target languages
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)
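
# A minimal usage sketch, not part of the original pipeline: the splits built
# above are typically batched with a BucketIterator, which groups examples of
# similar length to reduce padding. BATCH_SIZE and the device choice here are
# illustrative assumptions.
if __name__ == "__main__":
    import torch

    BATCH_SIZE = 64  # illustrative value
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_iterator, valid_iterator = data.BucketIterator.splits(
        (train_data, valid_data),
        batch_size=BATCH_SIZE,
        sort_key=lambda ex: len(ex.eng),  # bucket by English sentence length
        sort_within_batch=True,
        device=device,
    )
    print(f"SRC vocab: {len(SRC.vocab)} tokens, TRG vocab: {len(TRG.vocab)} tokens")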