Update textsegmentation.py
Browse files
- textsegmentation.py +4 -17
textsegmentation.py
CHANGED
@@ -3,29 +3,16 @@ def textsegmentation():
|
|
3 |
with open(contract_file_path, 'r') as file:
|
4 |
contract_text = file.read()
|
5 |
|
6 |
-
#
|
7 |
-
|
8 |
-
|
9 |
-
# Remove leading and trailing whitespace from each paragraph
|
10 |
-
paragraphs = [paragraph.strip() for paragraph in paragraphs]
|
11 |
-
|
12 |
-
# Remove line breaks within each paragraph
|
13 |
-
paragraphs = [re.sub(r'\s+', ' ', paragraph) for paragraph in paragraphs]
|
14 |
-
new_sentences.append(paragraphs)
|
15 |
-
|
16 |
-
# Print the extracted clauses
|
17 |
-
for i, clause in enumerate(paragraphs):
|
18 |
-
print(f"Segment {i+1}: {clause}\n")
|
19 |
|
20 |
# Prepare data for CSV
|
21 |
-
|
22 |
-
data = [(i+1, paragraph) for i, paragraph in enumerate(paragraphs)]
|
23 |
|
24 |
# Write the data to CSV file
|
25 |
with open(output_csv_file, 'w', newline='', encoding='utf-8') as file:
|
26 |
writer = csv.writer(file)
|
27 |
-
writer.writerow(['
|
28 |
writer.writerows(data)
|
29 |
|
30 |
-
|
31 |
print("Output saved to CSV file.")
|
|
|
3 |
with open(contract_file_path, 'r') as file:
|
4 |
contract_text = file.read()
|
5 |
|
6 |
+
# Tokenize the contract text into sentences
|
7 |
+
sentences = nltk.sent_tokenize(contract_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
# Prepare data for CSV
|
10 |
+
data = [(i+1, sentence) for i, sentence in enumerate(sentences)]
|
|
|
11 |
|
12 |
# Write the data to CSV file
|
13 |
with open(output_csv_file, 'w', newline='', encoding='utf-8') as file:
|
14 |
writer = csv.writer(file)
|
15 |
+
writer.writerow(['Sentence ID', 'Sentence Text']) # Write header
|
16 |
writer.writerows(data)
|
17 |
|
|
|
18 |
print("Output saved to CSV file.")
|