Spaces:
Paused
Paused
File size: 702 Bytes
2e748b3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
import re
def chunk_text(text,chunk_size=300,overlap=50):
words =text.split()
chunks=[]
i=0
while i<len(words):
chunk=words[i:i+chunk_size]
chunks.append(" ".join(chunk))
i+=chunk_size-overlap
return chunks
if __name__ =="__main__":
from step1_read_pdf import read_pdf
text=read_pdf("data/DST_Rapport_final_Reco_plant.pdf")
print(f"\n Longueur totale du texte : {len(text)} caractères")
chunks =chunk_text(text,chunk_size=300,overlap=50)
print(f"Nombre de chunks {len(chunks)}")
for i, chunk in enumerate(chunks[:3]):
print(f"\n Chunk {i+1} ({len(chunks)})")
print(chunk[:500], "..." if len(chunk)>500 else "")
|