paavansundar committed on
Commit
3a2adb5
1 Parent(s): e3d3ccf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -0
app.py CHANGED
@@ -14,6 +14,26 @@ __model_output_path = "paavansundar/Medical_QNA_GPT2"
14
  #prepare data
15
  def prepareData():
16
  df=pd.read_csv("MedQuAD.csv")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  # Create a Data collator object
19
  data_collator = DataCollatorForLanguageModeling(tokenizer=__tokenizer, mlm=False, return_tensors="pt")
 
# prepare data
def _format_samples(frame):
    """Render every row of *frame* as one '<question>...<answer>...' string.

    Values are passed through ``str`` so a column that pandas parsed as
    numeric cannot break the concatenation.  Line breaks inside a question
    or answer are replaced by a single space because the caller writes one
    sample per line.
    """
    samples = []
    for question, answer in zip(frame['Question'], frame['Answer']):
        text = '<question>' + str(question) + '<answer>' + str(answer)
        samples.append(' '.join(text.splitlines()))
    return samples


def prepareData(csv_path="MedQuAD.csv", train_path="train.txt", val_path="val.txt",
                train_per_focus=4, val_per_focus=1):
    """Build the training and validation text files from the MedQuAD CSV.

    The CSV must provide ``Focus``, ``Question`` and ``Answer`` columns.
    Rows whose question or answer is blank, rows containing any missing
    value, and duplicate question/answer pairs are discarded.  For every
    ``Focus`` value the first ``train_per_focus`` remaining rows (in file
    order) go to the training file and the next ``val_per_focus`` rows go to
    the validation file, so the two files never share a sample.  A topic
    with no more than ``train_per_focus`` rows contributes nothing to the
    validation file.

    Args:
        csv_path: Location of the source CSV.
        train_path: Output file for training samples, one per line.
        val_path: Output file for validation samples, one per line.
        train_per_focus: Maximum number of training rows taken per topic.
        val_per_focus: Maximum number of held-out rows taken per topic.
    """
    df = pd.read_csv(csv_path)

    # Turn empty / whitespace-only text into NaN so dropna() removes it too.
    df['Question'] = df['Question'].replace(r'^\s*$', np.nan, regex=True)
    df['Answer'] = df['Answer'].replace(r'^\s*$', np.nan, regex=True)
    df = df.drop_duplicates(subset=['Question', 'Answer'])
    df = df.dropna()

    # 0-based position of each row inside its Focus group, in file order.
    rank = df.groupby('Focus').cumcount()
    is_train = rank < train_per_focus
    is_val = (rank >= train_per_focus) & (rank < train_per_focus + val_per_focus)
    train_ds = df[is_train].reset_index(drop=True)
    # Held-out rows come strictly after the training rows of the same topic,
    # which keeps validation samples out of the training file.
    test_ds = df[is_val].reset_index(drop=True)

    train_seq = _format_samples(train_ds)
    val_seq = _format_samples(test_ds)

    # Explicit UTF-8 so the output does not depend on the platform's locale.
    with open(train_path, "w", encoding="utf-8") as f:
        f.writelines(line + '\n' for line in train_seq)

    with open(val_path, "w", encoding="utf-8") as f:
        f.writelines(line + '\n' for line in val_seq)
37
 
38
  # Create a Data collator object
39
  data_collator = DataCollatorForLanguageModeling(tokenizer=__tokenizer, mlm=False, return_tensors="pt")