maahi2412 committed
Commit 056a594 · verified · 1 Parent(s): d979d70

Update app.py

Files changed (1): app.py +6 -8
app.py CHANGED
@@ -15,9 +15,9 @@ from sklearn.metrics.pairwise import cosine_similarity
 app = Flask(__name__)
 CORS(app)
 UPLOAD_FOLDER = os.path.join(os.getcwd(), 'uploads')
-PEGASUS_MODEL_DIR = 'fine_tuned_pegasus'
-BERT_MODEL_DIR = 'fine_tuned_bert'
-LEGALBERT_MODEL_DIR = 'fine_tuned_legalbert'
+PEGASUS_MODEL_DIR = '/app/fine_tuned_pegasus'
+BERT_MODEL_DIR = '/app/fine_tuned_bert'
+LEGALBERT_MODEL_DIR = '/app/fine_tuned_legalbert'
 MAX_FILE_SIZE = 100 * 1024 * 1024
 
 if not os.path.exists(UPLOAD_FOLDER):
@@ -37,13 +37,11 @@ def load_or_finetune_pegasus():
     tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
     model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
 
-    # Load and normalize datasets
     cnn_dm = load_dataset("cnn_dailymail", "3.0.0", split="train[:5000]").rename_column("article", "text").rename_column("highlights", "summary")
     xsum = load_dataset("xsum", split="train[:5000]", trust_remote_code=True).rename_column("document", "text")
     combined_dataset = concatenate_datasets([cnn_dm, xsum])
 
     def preprocess_function(examples):
-        # Directly use normalized 'text' and 'summary' fields
         inputs = tokenizer(examples["text"], max_length=512, truncation=True, padding="max_length", return_tensors="pt")
         targets = tokenizer(examples["summary"], max_length=400, truncation=True, padding="max_length", return_tensors="pt")
         inputs["labels"] = targets["input_ids"]
@@ -54,7 +52,7 @@ def load_or_finetune_pegasus():
     eval_dataset = tokenized_dataset.select(range(8000, 10000))
 
     training_args = TrainingArguments(
-        output_dir="./pegasus_finetune",
+        output_dir="/app/pegasus_finetune",
         num_train_epochs=3,
         per_device_train_batch_size=1,
         per_device_eval_batch_size=1,
@@ -117,7 +115,7 @@ def load_or_finetune_bert():
     eval_dataset = tokenized_dataset.select(range(int(0.8 * len(tokenized_dataset)), len(tokenized_dataset)))
 
     training_args = TrainingArguments(
-        output_dir="./bert_finetune",
+        output_dir="/app/bert_finetune",
         num_train_epochs=3,
         per_device_train_batch_size=8,
         per_device_eval_batch_size=8,
@@ -180,7 +178,7 @@ def load_or_finetune_legalbert():
     eval_dataset = tokenized_dataset.select(range(int(0.8 * len(tokenized_dataset)), len(tokenized_dataset)))
 
     training_args = TrainingArguments(
-        output_dir="./legalbert_finetune",
+        output_dir="/app/legalbert_finetune",
         num_train_epochs=3,
         per_device_train_batch_size=8,
         per_device_eval_batch_size=8,
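
For context, the renamed constants feed the load-or-finetune helpers whose hunks appear above. A minimal sketch of that pattern, assuming (the actual control flow is not shown in this diff) that each helper reuses a checkpoint already saved under the new /app path and otherwise fine-tunes the base model and saves it there:

import os
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

PEGASUS_MODEL_DIR = '/app/fine_tuned_pegasus'  # same absolute path as in the hunk above

def load_or_finetune_pegasus():
    # Reuse a previously saved fine-tuned checkpoint if it exists under /app.
    if os.path.isdir(PEGASUS_MODEL_DIR):
        tokenizer = PegasusTokenizer.from_pretrained(PEGASUS_MODEL_DIR)
        model = PegasusForConditionalGeneration.from_pretrained(PEGASUS_MODEL_DIR)
        return tokenizer, model

    # Otherwise start from the base checkpoint; the dataset preparation and
    # TrainingArguments shown in the hunks above would run at this point.
    tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
    model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
    # ... fine-tune with Trainer(...), then persist the result ...
    model.save_pretrained(PEGASUS_MODEL_DIR)
    tokenizer.save_pretrained(PEGASUS_MODEL_DIR)
    return tokenizer, model

With an absolute PEGASUS_MODEL_DIR, the checkpoint lands in the same place regardless of the process working directory, which is the point of moving away from the bare 'fine_tuned_pegasus' paths.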
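
Because the model directories and the Trainer output_dir values now point at absolute locations under /app, that directory must be writable inside the container or save_pretrained and checkpointing will fail at runtime. A hypothetical startup guard (not part of app.py) could make that explicit:

import os

# Hypothetical check, not in app.py: the fine-tuning helpers need to create and
# write these /app/... directories, so their parent must be writable in the container.
for target in ('/app/fine_tuned_pegasus', '/app/pegasus_finetune',
               '/app/fine_tuned_bert', '/app/bert_finetune',
               '/app/fine_tuned_legalbert', '/app/legalbert_finetune'):
    parent = os.path.dirname(target)
    if not os.access(parent, os.W_OK):
        raise RuntimeError(f"{parent} is not writable; {target} cannot be created")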