kolkata97 committed on
Commit
9d49094
1 Parent(s): 36b3d89

Update textsegmentation.py

Browse files
Files changed (1) hide show
  1. textsegmentation.py +4 -17
textsegmentation.py CHANGED
@@ -3,29 +3,16 @@ def textsegmentation():
3
  with open(contract_file_path, 'r') as file:
4
  contract_text = file.read()
5
 
6
- # Split the contract text into paragraphs
7
- paragraphs = re.split(r'\n\s*\n', contract_text)
8
-
9
- # Remove leading and trailing whitespace from each paragraph
10
- paragraphs = [paragraph.strip() for paragraph in paragraphs]
11
-
12
- # Remove line breaks within each paragraph
13
- paragraphs = [re.sub(r'\s+', ' ', paragraph) for paragraph in paragraphs]
14
- new_sentences.append(paragraphs)
15
-
16
- # Print the extracted clauses
17
- for i, clause in enumerate(paragraphs):
18
- print(f"Segment {i+1}: {clause}\n")
19
 
20
  # Prepare data for CSV
21
- #assign to data only the clause
22
- data = [(i+1, paragraph) for i, paragraph in enumerate(paragraphs)]
23
 
24
  # Write the data to CSV file
25
  with open(output_csv_file, 'w', newline='', encoding='utf-8') as file:
26
  writer = csv.writer(file)
27
- writer.writerow(['Segment ID', 'Segment Text']) # Write header
28
  writer.writerows(data)
29
 
30
-
31
  print("Output saved to CSV file.")
 
3
  with open(contract_file_path, 'r') as file:
4
  contract_text = file.read()
5
 
6
+ # Tokenize the contract text into sentences
7
+ sentences = nltk.sent_tokenize(contract_text)
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  # Prepare data for CSV
10
+ data = [(i+1, sentence) for i, sentence in enumerate(sentences)]
 
11
 
12
  # Write the data to CSV file
13
  with open(output_csv_file, 'w', newline='', encoding='utf-8') as file:
14
  writer = csv.writer(file)
15
+ writer.writerow(['Sentence ID', 'Sentence Text']) # Write header
16
  writer.writerows(data)
17
 
 
18
  print("Output saved to CSV file.")