Spaces:
Sleeping
Sleeping
paavansundar
committed on
Commit
•
3a2adb5
1
Parent(s):
e3d3ccf
Update app.py
Browse files
app.py
CHANGED
@@ -14,6 +14,26 @@ __model_output_path = "paavansundar/Medical_QNA_GPT2"
|
|
14 |
#prepare data
|
15 |
def prepareData():
|
16 |
df=pd.read_csv("MedQuAD.csv")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
# Create a Data collator object
|
19 |
data_collator = DataCollatorForLanguageModeling(tokenizer=__tokenizer, mlm=False, return_tensors="pt")
|
|
|
14 |
#prepare data
|
15 |
def prepareData():
|
16 |
df=pd.read_csv("MedQuAD.csv")
|
17 |
+
df['Question']=df['Question'].replace(r'^\s*$', np.nan, regex=True)
|
18 |
+
df['Answer']=df['Answer'].replace(r'^\s*$', np.nan, regex=True)
|
19 |
+
df = df.drop_duplicates(subset=['Question', 'Answer'])
|
20 |
+
df=df.dropna()
|
21 |
+
train_ds=df.groupby('Focus').head(100)
|
22 |
+
train_ds=train_ds.groupby('Focus').head(4).reset_index(drop=True)
|
23 |
+
test_ds=train_ds.groupby('Focus').head(1).reset_index(drop=True)
|
24 |
+
train_seq=list()
|
25 |
+
for i in range(len(train_ds)):
|
26 |
+
s='<question>'+train_ds.loc[i,'Question']+'<answer>'+train_ds.loc[i,'Answer']
|
27 |
+
train_seq.append(s)
|
28 |
+
val_seq=list()
|
29 |
+
for i in range(len(test_ds)):
|
30 |
+
s='<question>'+test_ds.loc[i,'Question']+'<answer>'+test_ds.loc[i,'Answer']
|
31 |
+
val_seq.append(s)
|
32 |
+
with open("train.txt", "w") as f:
|
33 |
+
f.writelines(line+'\n' for line in train_seq)
|
34 |
+
|
35 |
+
with open("val.txt", "w") as f:
|
36 |
+
f.writelines(line+'\n' for line in val_seq)
|
37 |
|
38 |
# Create a Data collator object
|
39 |
data_collator = DataCollatorForLanguageModeling(tokenizer=__tokenizer, mlm=False, return_tensors="pt")
|