Alyosha11 committed on
Commit
c4d0a5f
1 Parent(s): 6455306

Upload extract.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. extract.py +34 -0
extract.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pyarrow.parquet as pq
3
+
4
def extract_parquet_files(directory, output_directory="extracted_csv_files"):
    """Convert every ``.parquet`` file directly inside *directory* to CSV.

    Each matching file is loaded with ``pq.read_table``, converted to a
    pandas DataFrame and written as ``<original stem>.csv`` inside
    *output_directory*. Sub-directories are not descended into, and names
    that do not end in ``.parquet`` (case-sensitive) are ignored.

    Args:
        directory: Path of the folder to scan for ``.parquet`` files.
        output_directory: Folder that receives the CSV files. It is created
            if missing. Defaults to ``"extracted_csv_files"``, resolved
            against the current working directory.

    Raises:
        OSError: If *directory* cannot be listed (for example, it does not
            exist) or *output_directory* cannot be created.
    """
    os.makedirs(output_directory, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # processing order and the progress log reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".parquet"):
            continue

        file_path = os.path.join(directory, filename)

        # The whole file is materialised in memory before being written out.
        table = pq.read_table(file_path)
        data = table.to_pandas()

        csv_filename = os.path.splitext(filename)[0] + ".csv"
        csv_file_path = os.path.join(output_directory, csv_filename)

        # index=False: do not emit the DataFrame's row index as a CSV column.
        data.to_csv(csv_file_path, index=False)

        print(f"Extracted data from {filename} saved as {csv_filename}")
29
+
30
# Folder (relative to the current working directory) that holds the
# .parquet inputs to convert.
parquet_directory = "hindi"

# Run the conversion for every parquet file found in that folder.
extract_parquet_files(directory=parquet_directory)