jerpint committed on
Commit
fbf9436
1 Parent(s): b4b5bdf

add preprocess file (#2)

Browse files
Files changed (1) hide show
  1. data/preprocess_chunks.py +50 -0
data/preprocess_chunks.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+
4
def combine(x):
    """Collapse one group of chunk rows into a single-row DataFrame.

    Rows whose ``content`` is NaN are discarded; the surviving chunk
    texts are joined with single spaces. The ``url`` and ``title``
    fields are taken from the first unique value of the group's
    ``source`` / ``title`` columns, and ``source`` is hard-coded to
    ``"towardsai_blog"``.

    Intended for use with ``df.groupby("ID").apply(combine)``.
    """
    rows = x.dropna(subset="content")
    merged_text = " ".join(rows.content.to_list())
    record = {
        "content": merged_text,
        "url": rows.source.unique()[0],
        "source": "towardsai_blog",
        "title": rows.title.unique()[0],
    }
    return pd.DataFrame(record, index=[0])
15
+
16
+
17
# --- Recombine the pre-chunked rows back into one row per article ID. ---
filename = "output.csv"
df = pd.read_csv(filename)

# groupby.apply(combine) returns a frame with a (ID, 0) MultiIndex;
# reset_index() surfaces both levels, and the inner "level_1" column
# (always 0, an artifact of combine's index=[0]) is discarded.
df_combined = df.groupby("ID").apply(func=combine)
df_combined = df_combined.reset_index().drop(columns=["level_1"])
df_combined.to_csv("chunks_preprocessed_combined.csv", index=False)

# --- Naively re-split each article into rows of at most MAX_WORDS words. ---
MAX_WORDS = 500

new_rows = []
for _, row in df_combined.iterrows():  # row index is unused
    words = row["content"].split()
    # Ceiling division on the word count. NOTE: a row whose content is
    # empty yields 0 chunks and is silently dropped, matching the
    # original behavior.
    num_chunks = (len(words) - 1) // MAX_WORDS + 1
    for i in range(num_chunks):
        new_row = row.copy()
        new_row["content"] = " ".join(words[i * MAX_WORDS:(i + 1) * MAX_WORDS])
        new_rows.append(new_row)

# Creating a new DataFrame with the split rows; reset_index(drop=True)
# discards the repeated source-row labels directly instead of
# materializing an "index" column and dropping it afterwards.
new_df = pd.DataFrame(new_rows).reset_index(drop=True)

new_df.to_csv("chunks_preprocessed.csv", index=False)