Spaces:
Running
Running
import pandas as pd | |
def combine(x):
    """Merge all chunk rows of one group into a single-row DataFrame.

    Rows whose ``content`` is missing are discarded, the remaining
    ``content`` values are joined with single spaces, and the first unique
    ``source`` and ``title`` values of the group are kept.

    Args:
        x: DataFrame holding the rows of one group; it must provide the
            ``content``, ``source`` and ``title`` columns.

    Returns:
        A DataFrame with the columns ``content``, ``url``, ``source`` and
        ``title``. It has exactly one row (index ``0``), or no rows at all
        when every ``content`` value of the group is missing.
    """
    # The list form of `subset` is accepted by every pandas version.
    rows = x.dropna(subset=["content"])
    if rows.empty:
        # Nothing left to merge: indexing `unique()[0]` below would raise
        # IndexError, so return an empty frame with the same columns instead.
        return pd.DataFrame(columns=["content", "url", "source", "title"])

    return pd.DataFrame(
        {
            # Cast to str so a non-string cell cannot break the join.
            "content": " ".join(rows["content"].astype(str).to_list()),
            # The output "url" is read from the input's "source" column.
            "url": rows["source"].unique()[0],
            "source": "towardsai_blog",
            "title": rows["title"].unique()[0],
        },
        index=[0],
    )
# Recombine the chunks: collapse all rows sharing an "ID" into one row each.
filename = "output.csv"
df = pd.read_csv(filename)
df_combined = (
    df.groupby("ID")
    .apply(func=combine)
    # Turn the group index into regular columns ...
    .reset_index()
    # ... and discard "level_1", the inner index level left over from the
    # single-row frames returned by `combine`.
    .drop(columns=["level_1"])
)
df_combined.to_csv("chunks_preprocessed_combined.csv", index=False)
# Naive word-count split: break every combined row into rows holding at most
# MAX_WORDS whitespace-separated words of its content.
MAX_WORDS = 500
new_rows = []
for _, article in df_combined.iterrows():
    words = article["content"].split()
    # Stepping by MAX_WORDS yields ceil(len(words) / MAX_WORDS) slices, and
    # none at all when the content contains no words.
    for start in range(0, len(words), MAX_WORDS):
        piece = article.copy()  # all other columns are carried over unchanged
        piece["content"] = " ".join(words[start : start + MAX_WORDS])
        new_rows.append(piece)

# Build the split DataFrame; the repeated source-row labels are discarded in
# favour of a fresh 0..n-1 index.
new_df = pd.DataFrame(new_rows).reset_index(drop=True)
new_df.to_csv("chunks_preprocessed.csv", index=False)