Reham721 committed on
Commit
a5187e1
1 Parent(s): 286646b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +172 -4
app.py CHANGED
@@ -1,7 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- iface.launch()
 
1
+
2
+
3
+ !pip install sentencepiece==0.1.94
4
+ !pip install transformers
5
+ from transformers import AutoTokenizer
6
+
7
+ !pip install huggingface_hub
8
+ from huggingface_hub import notebook_login
9
+ notebook_login()
10
+
11
+ from transformers import AutoTokenizer
12
+ tokenizer1 = AutoTokenizer.from_pretrained("Reham721/Subjective_QG")
13
+ tokenizer2 = AutoTokenizer.from_pretrained("Reham721/MCQs")
14
+
15
+ from transformers import AutoModelForSeq2SeqLM
16
+
17
+ model1 = AutoModelForSeq2SeqLM.from_pretrained("Reham721/Subjective_QG")
18
+ model2 = AutoModelForSeq2SeqLM.from_pretrained("Reham721/MCQs")
19
+
20
+ !git clone https://github.com/aub-mind/arabert
21
+ !pip install pyarabic
22
+
23
+ from arabert.preprocess import ArabertPreprocessor
24
+ from transformers import pipeline
25
+
26
+ prep = ArabertPreprocessor("aubmindlab/araelectra-base-discriminator") #or empty string it's the same
27
+ qa_pipe =pipeline("question-answering",model="wissamantoun/araelectra-base-artydiqa")
28
+
29
def generate_questions(model, tokenizer, input_sequence):
    """Generate candidate questions for *input_sequence* with beam search.

    Args:
        model: Object exposing ``generate(**kwargs)`` that returns an iterable
            of token-id sequences.
        tokenizer: Object exposing ``encode(text, return_tensors=...)`` and
            ``decode(ids, skip_special_tokens=...)``.
        input_sequence: Text prompt fed to the model.

    Returns:
        A list with one decoded string per sequence returned by the model
        (three are requested).
    """
    # Tokenize the prompt as PyTorch tensors
    input_ids = tokenizer.encode(input_sequence, return_tensors='pt')

    # Deterministic beam search: no sampling is enabled, so sampling-only
    # knobs such as temperature would have no effect and are not passed.
    outputs = model.generate(
        input_ids=input_ids,
        max_length=200,  # upper bound on generated length, in tokens
        num_beams=3,  # beam width; must be >= num_return_sequences
        no_repeat_ngram_size=3,  # forbid repeating any 3-gram within an output
        early_stopping=True,  # stop once enough finished beams exist
        num_return_sequences=3,  # return every beam as a candidate question
    )

    # Decode each generated sequence, dropping special tokens
    return [
        tokenizer.decode(output, skip_special_tokens=True)
        for output in outputs
    ]
52
+
53
def get_sorted_questions(questions, context):
    """Rank *questions* by how confidently the QA pipeline answers them.

    The context is preprocessed once with the module-level ``prep`` object,
    then every question is run through the module-level ``qa_pipe`` against
    that preprocessed context.

    Args:
        questions: Iterable of question strings.
        context: Passage the questions are asked against.

    Returns:
        A dict mapping each question to the pipeline's ``"score"``, ordered
        from highest to lowest score.
    """
    cleaned_context = prep.preprocess(context)

    scores = {}
    for candidate in questions:
        print(candidate)
        answer_span = qa_pipe(question=candidate, context=cleaned_context)
        print(answer_span)
        scores[candidate] = answer_span["score"]

    # Highest-scoring question first; dicts preserve insertion order
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)
63
+
64
+ !pip install arabic_reshaper
65
+ !pip install python-bidi
66
+
67
+ import unicodedata
68
+ import arabic_reshaper
69
+ from bidi.algorithm import get_display
70
+
71
def is_arabic(text):
    """Return True when every alphabetic character in *text* is Arabic script.

    Digits, punctuation, whitespace and other non-alphabetic characters are
    ignored, so an empty or letter-free string also yields True.

    Args:
        text: String to inspect.

    Returns:
        False as soon as an alphabetic character whose Unicode name does not
        start with ``ARABIC`` is found, True otherwise.
    """
    # The check runs on the raw characters: shaping or reordering the text
    # for display does not change which script a letter belongs to.
    # A default name of '' keeps unnamed code points from raising ValueError;
    # they are simply treated as non-Arabic.
    return all(
        unicodedata.name(char, '').startswith('ARABIC')
        for char in text
        if char.isalpha()
    )
81
+
82
+ import random
83
+ import re
84
def _extract_distractors(decoded_output):
    """Split one decoded generation into clean, Arabic-only candidate strings."""
    # Angle-bracket tags and the literal text "None" act as separators
    pieces = (piece.strip() for piece in re.split(r'<[^>]*>|None', decoded_output))
    return [piece for piece in pieces if piece and is_arabic(piece)]


def generate_distractors(question, answer, context, num_distractors=3, k=10,
                         max_attempts=5):
    """Sample wrong-answer options (distractors) for a multiple-choice question.

    The prompt ``question <sep> answer <sep> context`` is encoded with the
    module-level ``tokenizer2`` and sampled from the module-level ``model2``.
    Each generation is split into candidates, which are kept only if they are
    Arabic, differ from *answer*, and have not been seen before.

    Args:
        question: Question text.
        answer: Correct answer; never returned as a distractor.
        context: Passage the question is based on.
        num_distractors: Number of distractors wanted.
        k: Maximum size of the candidate pool the result is drawn from.
        max_attempts: Maximum number of sampling rounds, so the function
            terminates even if the model keeps producing unusable output.

    Returns:
        A randomly ordered list of up to ``num_distractors`` unique strings.
        The list is shorter when too few valid candidates were produced
        within ``max_attempts`` rounds.
    """
    input_sequence = f'{question} <sep> {answer} <sep> {context}'
    input_ids = tokenizer2.encode(input_sequence, return_tensors='pt')

    unique_distractors = []
    for _ in range(max_attempts):
        missing = num_distractors - len(unique_distractors)
        if missing <= 0:
            break

        # Only ask the model for as many sequences as are still missing
        outputs = model2.generate(
            input_ids,
            do_sample=True,
            max_length=50,
            top_k=50,
            top_p=0.95,
            num_return_sequences=missing,
            no_repeat_ngram_size=2,
        )
        for output in outputs:
            decoded_output = tokenizer2.decode(output, skip_special_tokens=True)
            # Every round goes through the same cleaning and filtering, so
            # raw generations never leak into the result
            for candidate in _extract_distractors(decoded_output):
                if candidate != answer and candidate not in unique_distractors:
                    unique_distractors.append(candidate)

    # A prefix of a shuffled list is a uniform random sample: cap the pool
    # at k candidates, then take the requested number from it
    random.shuffle(unique_distractors)
    return unique_distractors[:k][:num_distractors]
142
+
143
+ !pip install gradio
144
+
145
  import gradio as gr
146
 
147
+ context = gr.inputs.Textbox(lines=5,placeholder="Enter paragraph/context here...")
148
+ answer = gr.inputs.Textbox(lines=3, placeholder="Enter answer/keyword here...")
149
+ question_type = gr.inputs.Radio(choices=["Subjective", "MCQ"], label="Question type")
150
+ question = gr.outputs.Textbox( type="text", label="Question")
151
+
152
def generate_question(context, answer, question_type):
    """Gradio callback: build a question, or MCQ distractors, for an answer.

    Candidate questions are generated from ``answer<sep>context`` with
    ``model1``/``tokenizer1`` and ranked by ``get_sorted_questions``; the
    top-ranked one is used.

    Args:
        context: Passage the question should be about.
        answer: Answer or keyword the question should target.
        question_type: ``"Subjective"`` to return the question itself; any
            other value is treated as MCQ.

    Returns:
        For ``"Subjective"``, the top-ranked question string. Otherwise a
        string with one ``-<distractor>`` line per generated distractor.
    """
    article = answer+"<sep>"+context
    candidates = generate_questions(model1, tokenizer1, article)
    ranked = get_sorted_questions(candidates, context)
    best_question = next(iter(ranked))  # dict is ordered best-first

    if question_type == "Subjective":
        return best_question

    # Distractors must be conditioned on the generated question text, not on
    # the module-level `question` output component
    mcqs = generate_distractors(best_question, answer, context)
    # Joining over the list avoids an IndexError if fewer than three come back
    return "".join("-" + distractor + "\n" for distractor in mcqs)
161
+
162
+ iface = gr.Interface(
163
+ fn=generate_question,
164
+ inputs=[context,answer,question_type],
165
+ outputs=question,
166
+ list_outputs=True,
167
+ rtl=True)
168
+
169
+ iface.launch(debug=True,share=True) # will create a temporary sharable link
170
+
171
+ pip install --upgrade pip
172
+
173
+ pip install huggingface-spaces
174
 
175
+ !gradio deploy