abhisheky127 committed on
Commit
53a8c9e
1 Parent(s): 3cedfb4

testing new preprocess

Browse files
Files changed (1) hide show
  1. app.py +38 -11
app.py CHANGED
@@ -26,22 +26,49 @@ def zero_shot(doc, candidates):
26
  # return match.group(1).strip()
27
  # return None
28
 
29
- def preprocess(transaction):
30
- remove_words = ["pos", "mps", "bil", "onl"]
31
 
32
- # Convert to lowercase
33
- transaction = transaction.lower()
34
 
35
- # Remove unwanted words
36
- for word in remove_words:
37
- transaction = transaction.replace(word, "")
38
 
39
- # Remove special characters and digits
40
- transaction = re.sub(r"[^a-z\s]+", "", transaction)
41
 
 
 
 
 
 
 
 
 
 
 
 
42
  # Remove extra spaces
43
- transaction = re.sub(r"\s+", " ", transaction).strip()
44
- return transaction
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
 
47
  #create input and output objects
 
26
  # return match.group(1).strip()
27
  # return None
28
 
29
+ # def preprocess(transaction):
30
+ # remove_words = ["pos", "mps", "bil", "onl"]
31
 
32
+ # # Convert to lowercase
33
+ # transaction = transaction.lower()
34
 
35
+ # # Remove unwanted words
36
+ # for word in remove_words:
37
+ # transaction = transaction.replace(word, "")
38
 
39
+ # # Remove special characters and digits
40
+ # transaction = re.sub(r"[^a-z\s]+", "", transaction)
41
 
42
+ # # Remove extra spaces
43
+ # transaction = re.sub(r"\s+", " ", transaction).strip()
44
+ # return transaction
45
+
46
def preprocess(text):
    """Normalise a raw transaction description into lowercase keywords.

    Steps, in order:
      1. Delete digits (no space is inserted in their place).
      2. Replace every remaining non-letter, non-whitespace character
         with a space.
      3. Upper-case the text and split it into whitespace-separated words.
      4. Drop the final word if it is "PVT", "LTD" or "INDIA" (only one
         trailing word is dropped).
      5. Drop every word equal to "MPS", "POS", "BIL" or "ONL".
      6. Join the remaining words with single spaces and lower-case them.

    Args:
        text: The raw description string.

    Returns:
        The cleaned, lowercase string. An empty string is returned when
        nothing is left after cleaning (e.g. empty, whitespace-only or
        digit-only input).
    """
    trailing_suffixes = ("PVT", "LTD", "INDIA")
    words_to_remove = ("MPS", "POS", "BIL", "ONL")

    # Digits are removed outright, so "AB12CD" becomes "ABCD".
    cleaned_text = re.sub(r'\d', '', text)

    # Other special characters become spaces so neighbouring words stay apart.
    cleaned_text = re.sub(r'[^a-zA-Z\s]', ' ', cleaned_text)

    # split() with no argument collapses whitespace runs and trims both ends.
    words = cleaned_text.upper().split()

    # Guard against an empty word list: indexing [-1] on it would raise
    # IndexError for inputs that contain no letters at all.
    if words and words[-1] in trailing_suffixes:
        words = words[:-1]

    # Whole-word comparison, so "DEPOSIT" is kept even though it contains "POS".
    kept = [word for word in words if word not in words_to_remove]

    return " ".join(kept).lower()
72
 
73
 
74
  #create input and output objects