import os

import pyarrow.parquet as pq
from joblib import Parallel, delayed


def extract_text_from_parquet(parquet_file, output_dir):
    # Read the Parquet file
    table = pq.read_table(parquet_file)

    # Convert the table to a Pandas DataFrame
    df = table.to_pandas()

    # Iterate over each row in the DataFrame
    for _, row in df.iterrows():
        doc_id = row['doc_id']
        text = row['text']

        # Create the output file path
        output_file = os.path.join(output_dir, f"{doc_id}.txt")

        # Write the text to the output file
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(text)

        print(f"Extracted text for doc_id: {doc_id}")


def process_parquet_file(parquet_file, parquet_directory, output_directory):
    parquet_file_path = os.path.join(parquet_directory, parquet_file)
    extract_text_from_parquet(parquet_file_path, output_directory)


def main():
    parquet_directory = 'hindi'
    output_directory = 'txt/'

    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Get a list of all Parquet files in the directory
    parquet_files = [file for file in os.listdir(parquet_directory) if file.endswith('.parquet')]

    # Use joblib to parallelize the extraction of text from the Parquet files
    Parallel(n_jobs=-1)(
        delayed(process_parquet_file)(parquet_file, parquet_directory, output_directory)
        for parquet_file in parquet_files
    )


if __name__ == '__main__':
    main()