jerpint committed on
Commit
fbf9436
1 Parent(s): b4b5bdf

add preprocess file (#2)

Browse files
Files changed (1) hide show
  1. data/preprocess_chunks.py +50 -0
data/preprocess_chunks.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+
4
def combine(x):
    """Collapse one group of chunk rows into a single-row DataFrame.

    Rows whose ``content`` is NaN are discarded; the surviving chunk
    texts are joined with single spaces. The ``url`` and ``title``
    fields are taken from the first unique value of the group's
    ``source`` / ``title`` columns, and ``source`` is hard-coded to
    ``"towardsai_blog"``.

    Intended for use with ``df.groupby("ID").apply(combine)``.
    """
    rows = x.dropna(subset="content")
    merged_text = " ".join(rows.content.to_list())
    record = {
        "content": merged_text,
        "url": rows.source.unique()[0],
        "source": "towardsai_blog",
        "title": rows.title.unique()[0],
    }
    return pd.DataFrame(record, index=[0])
15
+
16
+
17
# --- Recombine the pre-chunked rows back into one row per article ID. ---
filename = "output.csv"
df = pd.read_csv(filename)

# groupby.apply(combine) returns a frame with a (ID, 0) MultiIndex;
# reset_index() surfaces both levels, and the inner "level_1" column
# (always 0, an artifact of combine's index=[0]) is discarded.
df_combined = df.groupby("ID").apply(func=combine)
df_combined = df_combined.reset_index().drop(columns=["level_1"])
df_combined.to_csv("chunks_preprocessed_combined.csv", index=False)

# --- Naively re-split each article into rows of at most MAX_WORDS words. ---
MAX_WORDS = 500

new_rows = []
for _, row in df_combined.iterrows():  # row index is unused
    words = row["content"].split()
    # Ceiling division on the word count. NOTE: a row whose content is
    # empty yields 0 chunks and is silently dropped, matching the
    # original behavior.
    num_chunks = (len(words) - 1) // MAX_WORDS + 1
    for i in range(num_chunks):
        new_row = row.copy()
        new_row["content"] = " ".join(words[i * MAX_WORDS:(i + 1) * MAX_WORDS])
        new_rows.append(new_row)

# Creating a new DataFrame with the split rows; reset_index(drop=True)
# discards the repeated source-row labels directly instead of
# materializing an "index" column and dropping it afterwards.
new_df = pd.DataFrame(new_rows).reset_index(drop=True)

new_df.to_csv("chunks_preprocessed.csv", index=False)