nneka commited on
Commit
2ebbf88
·
verified ·
1 Parent(s): 108d4f9

Delete memory_support_chatbot_for_pregnant_women.py

Browse files
memory_support_chatbot_for_pregnant_women.py DELETED
@@ -1,463 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- """Memory Support Chatbot for Pregnant Women.ipynb
3
-
4
- Automatically generated by Colaboratory.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/19GKLkfspyhjt-F2nq_1kMHdYZpco5Kzo
8
-
9
- #Proposal Title: Memory Support Chatbot for Pregnant Women.
10
-
11
- Project Summary:
12
- The proposed project is to develop a Memory Support Chatbot for Pregnant Women using the GPT-2 language model. The chatbot aims to provide support, solutions, tips, and advice for pregnant women experiencing cognitive memory issues. The goal is to offer a personalized and accessible resource for pregnant women to manage their cognitive memory challenges effectively.
13
-
14
- Proposed Project and Reasoning:
15
- The project is proposed to address the lack of easily accessible and personalized support for pregnant women facing cognitive memory issues. By leveraging the capabilities of the GPT-2 language model, the chatbot can provide instant responses and guidance, complementing the advice of healthcare professionals.
16
-
17
- *Dictionary-Based DataFrame creation method: This method reads the text files into a list of dictionaries and then creates a pandas DataFrame from these dictionaries, using Streamlit (rather than Gradio) as the app framework. The model is then fine-tuned.*
18
-
19
- Install Required Libraries
20
- """
21
-
22
# Install required third-party libraries.
# NOTE: these are Colab/Jupyter shell magics — they are not valid Python
# outside a notebook environment.
!pip install streamlit     # Streamlit for the interactive chat UI
!pip install pandas        # Pandas for data manipulation and analysis
!pip install matplotlib    # Matplotlib for plotting
!pip install nltk          # NLTK for tokenization, stopwords, sentiment
!pip install wordcloud     # WordCloud for the word-cloud figure
!pip install transformers  # Hugging Face Transformers for the GPT-2 model
29
-
30
- """Import Libraries"""
31
-
32
- import pandas as pd # Pandas for data manipulation and analysis
33
- import nltk # NLTK for natural language processing tasks
34
- from nltk.corpus import stopwords # Stopwords from NLTK
35
- from nltk.tokenize import word_tokenize # Word tokenizer from NLTK
36
- import streamlit as st # Streamlit for creating interactive web apps
37
- import matplotlib.pyplot as plt # Matplotlib for data visualization
38
- from wordcloud import WordCloud # Wordcloud for generating word clouds
39
- from transformers import GPT2Tokenizer, GPT2LMHeadModel # GPT-2 model from Transformers
40
- import torch # PyTorch for deep learning tasks
41
- from torch.utils.data import DataLoader, Dataset,TensorDataset # DataLoader and Dataset for handling data
42
- from transformers import GPT2Config, GPT2LMHeadModel, AdamW # AdamW optimizer for GPT-2 model
43
- from transformers import AdamW, get_scheduler # Scheduler for optimizer
44
- from torch.nn.utils.rnn import pad_sequence # Padding sequences for model input
45
- from nltk.sentiment import SentimentIntensityAnalyzer # Sentiment analysis from NLTK
46
- from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF vectorizer
47
- from sklearn.decomposition import LatentDirichletAllocation # LDA for topic modeling
48
# One-time NLTK resource download and sentiment-analyzer initialisation.
nltk.download('vader_lexicon')  # VADER lexicon used by SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Load the pre-trained GPT-2 tokenizer and language-model head.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # byte-pair tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')    # LM head used for generation
model.resize_token_embeddings(len(tokenizer))      # keep embedding table in sync with vocab size
55
-
56
- """#Data Collection and Preparation: Clean the dataset by removing special characters, digits, and unnecessary whitespace"""
57
-
58
# --- Data collection ---------------------------------------------------------
# Paths of the source articles (Colab /content uploads).
txt_files = [
    "/content/Cognition in Pregnancy- Perceptions and Performance, 2005-2006 - Dataset - B2FIND.txt",
    "/content/Frontiers | Cognitive disorder and associated factors among pregnant women attending antenatal servi.txt",
    "/content/Frustrated By Brain Fog? How Pregnancy Actually Alters Yo....txt",
    "/content/Is Pregnancy Brain Real?.txt",
    "/content/Is ‘pregnancy brain’ real or just a myth? | Your Pregnancy Matters | UT Southwestern Medical Center.txt",
    "/content/Memory and affective changes during the antepartum- A narrative review and integrative hypothesis- J.txt",
    "/content/Pregnancy 'does cause memory loss' | Medical research | The Guardian.txt",
    "/content/Pregnancy Brain — Forgetfulness During Pregnancy.txt",
    "/content/Pregnancy brain- When it starts and what causes pregnancy brain fog | BabyCenter.txt",
    "/content/Pregnancy does cause memory loss, study says - CNN.txt",
    "/content/Textbook J.A. Russell, A.J. Douglas, R.J. Windle, C.D. Ingram - The Maternal Brain_ Neurobiological and Neuroendocrine Adaptation and Disorders in Pregnancy & Post Partum-Elsevier Science (2001).txt",
    "/content/The effect of pregnancy on maternal cognition - PMC.txt",
    "/content/This Is Your Brain on Motherhood - The New York Times.txt",
    "/content/Working memory from pregnancy to postpartum.txt",
    "/content/What Is Mom Brain and Is It Real?.txt",
    "/content/Memory loss in Pregnancy- Myth or Fact? - International Forum for Wellbeing in Pregnancy.txt",
    "/content/Memory and mood changes in pregnancy- a qualitative content analysis of women’s first-hand accounts.txt",
    "/content/Is Mom Brain real? Understanding and coping with postpartum brain fog.txt",
    "/content/Everyday Life Memory Deficits in Pregnant Women.txt",
    "/content/Cognitive Function Decline in the Third Trimester.txt",
    "/content/'Mommy brain' might be a good thing, new research suggests | CBC Radio.txt"
]

# Read each article into a one-column DataFrame.
# encoding='utf-8' is explicit: several articles contain non-ASCII characters
# (curly quotes, em dashes), and the platform-default codec may fail on them.
data = []
for file_path in txt_files:
    with open(file_path, "r", encoding="utf-8") as file:
        data.append({"text": file.read()})

df = pd.DataFrame(data)

# Display the DataFrame
print(df)
95
-
96
- """##Data Cleaning and Manipulation"""
97
-
98
# --- Data cleaning -----------------------------------------------------------
# Tokenize every article into a word list.
nltk.download('punkt')  # tokenizer models required by word_tokenize
df['tokens'] = df['text'].apply(word_tokenize)

# English stopword set for filtering.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def _normalize_tokens(tokens):
    """Lowercase tokens, keeping only alphanumeric words that are not stopwords."""
    return [tok.lower() for tok in tokens if tok.isalnum() and tok.lower() not in stop_words]

# Apply the filter, then join the surviving tokens back into one string per article.
df['cleaned_text'] = df['tokens'].apply(_normalize_tokens)
df['cleaned_text'] = df['cleaned_text'].apply(' '.join)

# Display the cleaned text
print(df['cleaned_text'])
116
-
117
- """##Exploratory Data Analysis and Visualization
118
-
119
- * Perform basic statistics on the dataset, such as word count, average length of articles
120
- """
121
-
122
# --- Basic corpus statistics -------------------------------------------------
# Words per article (whitespace-separated tokens of the cleaned text).
df['word_count'] = df['cleaned_text'].apply(lambda x: len(x.split()))

# Average length of articles
average_length = df['word_count'].mean()

# Minimum and maximum word count
min_word_count = df['word_count'].min()
max_word_count = df['word_count'].max()

print(f"Average length of articles: {average_length:.2f} words")
print(f"Minimum word count: {min_word_count} words")
print(f"Maximum word count: {max_word_count} words")

# Show the cleaned text alongside its word count.
print(df[['cleaned_text', 'word_count']])

# Histogram of article lengths (20 bins).
plt.figure(figsize=(10, 6))
plt.hist(df['word_count'], bins=20, color='brown', edgecolor='black')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.title('Distribution of Word Counts')
plt.show()
146
-
147
- """* Word Cloud (Display the most common words or phrases, providing a visual representation of the main themes and topics discussed in the articles)"""
148
-
149
# Concatenate all cleaned articles into one string for the word cloud.
all_text = " ".join(df["cleaned_text"])

# Generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(all_text)

# Render the cloud without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title('Word Cloud of Cognitive Memory Issues')
plt.show()
161
-
162
- """
163
-
164
- * Frequency of Cognitive Memory Issues (Describes the frequency of different types of cognitive memory issues reported in the text data. It shows the number of occurrences of each type of issue, such as memory loss, difficulty concentrating, forgetfulness, brain fog, and others, providing insights into the prevalence of these issues in the dataset)
165
-
166
- """
167
-
168
def cleaned_text(text):
    """Return *text* lowercased.

    Minimal normalisation used before the substring matching below.
    (The original body assigned to a local named ``cleaned_text``, shadowing
    the function itself; returning directly avoids that.)
    """
    return text.lower()
172
-
173
# Re-clean the raw text with the simple lowercasing function.
# NOTE: this overwrites the token-based 'cleaned_text' column built earlier.
df["cleaned_text"] = df["text"].apply(cleaned_text)

# Categories of cognitive memory issue to search for.
types_of_issues = ['Memory Loss', 'Difficulty Concentrating', 'Forgetfulness', 'Brain Fog', 'Others']

# Document frequency per category: how many articles mention the phrase
# (lowercased substring match; at most one count per article).
frequencies = {
    issue: sum(1 for text in df["cleaned_text"] if issue.lower() in text)
    for issue in types_of_issues
}

# Frequencies as a DataFrame for plotting.
df_frequencies = pd.DataFrame(list(frequencies.items()), columns=['Types of cognitive memory issues', 'Frequency'])

# Bar chart of category frequencies.
plt.figure(figsize=(10, 6))
plt.bar(df_frequencies['Types of cognitive memory issues'], df_frequencies['Frequency'], color='skyblue')
plt.xlabel('Types of cognitive memory issues')
plt.ylabel('Frequency')
plt.title('Frequency of Cognitive Memory Issues')
plt.xticks(rotation=45)
plt.show()

# Display the cleaned text
print(df['cleaned_text'])
202
-
203
- """**Text Mining and NLP Analysis**
204
- Goal is to extract key terms and topics related to cognitive memory issues during pregnancy, providing insights into the content and themes of the articles. This analysis helps identify patterns, trends, and prevalent topics in the literature, which can inform the development of the chatbox and enhance its ability to provide relevant and personalized support to pregnant women.
205
- """
206
-
207
# --- Text mining: TF-IDF + LDA topic modelling -------------------------------
# TF-IDF vectorisation of the cleaned articles (English stopwords removed).
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])

# Latent Dirichlet Allocation with 5 topics; fixed seed for reproducibility.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(tfidf_matrix)

# Top-5 terms per topic: argsort ascending, then [:-6:-1] takes the 5 largest.
terms = tfidf_vectorizer.get_feature_names_out()
topics = [[terms[i] for i in topic.argsort()[:-6:-1]] for topic in lda.components_]

# Print topics
for i, topic in enumerate(topics):
    print(f"Topic {i+1}: {', '.join(topic)}")
223
-
224
- """**Sentiment Analysis and Comparison and Contrast:** This analysis assigns a sentiment score to each article, indicating its overall sentiment (positive, neutral, or negative). The sentiment scores can help understand the general tone and attitude of the articles towards cognitive memory issues during pregnancy."""
225
-
226
# --- Sentiment analysis (VADER) ----------------------------------------------
# Compound polarity score per article:
#   -1 = extremely negative, 0 = neutral, +1 = extremely positive.
df['sentiment_score'] = df['cleaned_text'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Histogram of compound scores across the articles.
plt.figure(figsize=(10, 6))
plt.hist(df['sentiment_score'], bins=20, color='green')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.title('Sentiment Analysis of Articles')
plt.show()
243
-
244
- """#Comparing Baseline GPT-2 Model Responses Using Raw and Cleaned Text Before Fine-Tuning
245
-
246
- ##*For the baseline model with raw text:*
247
- """
248
-
249
# Baseline GPT-2 response on the *raw* (uncleaned) prompt text.
def baseline_generate_response(raw_text):
    """Generate a baseline GPT-2 continuation (max 100 tokens) for *raw_text*.

    Passes the tokenizer-produced attention mask to ``generate`` — the
    original dropped it, unlike the other generate helpers in this file,
    which triggers a warning and can alter generation around EOS/pad ids.
    """
    encoded = tokenizer(raw_text, return_tensors='pt')
    output = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=100,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage
raw_text1 = "How does pregnancy affect memory?"
raw_text2 = "What are the effects of pregnancy on cognitive function?"
baseline_response_raw1 = baseline_generate_response(raw_text1)
baseline_response_raw2 = baseline_generate_response(raw_text2)
# rstrip('!') strips any trailing exclamation marks from the decoded text.
print("Baseline Response (Raw Text 1):", baseline_response_raw1.rstrip('!'))
print("Baseline Response (Raw Text 2):", baseline_response_raw2.rstrip('!'))
266
-
267
- """##*For the baseline model with cleaned text:*"""
268
-
269
# Baseline GPT-2 response on cleaned prompt text.
def baseline_generate_response(cleaned_text):
    """Return the baseline GPT-2 continuation (max 100 tokens) of *cleaned_text*."""
    prompt_ids = tokenizer.encode(cleaned_text, return_tensors='pt')
    # Mask positions equal to the EOS token so generate() ignores them.
    eos_mask = prompt_ids.ne(tokenizer.eos_token_id)
    generated = model.generate(
        prompt_ids,
        max_length=100,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        attention_mask=eos_mask,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Example usage
cleaned_text1 = "How does pregnancy affect memory?"
cleaned_text2 = "What are the effects of pregnancy on cognitive function?"
baseline_response1 = baseline_generate_response(cleaned_text1)
baseline_response2 = baseline_generate_response(cleaned_text2)
print("Baseline Response 1:", baseline_response1)
print("Baseline Response 2:", baseline_response2)
286
-
287
- """#Fine-Tuning the GPT-2 Model
288
-
289
- Tokenization and Padding for Fine-Tuning GPT-2 Model
290
- """
291
-
292
# --- Tokenization and padding for fine-tuning --------------------------------
# Maximum sequence length. NOTE(review): x[:max_length] below truncates the
# *string* to 512 characters, not 512 tokens — token counts still vary.
max_length = 512

# Tokenize the (character-truncated) cleaned text; each entry is shape (1, L).
df['tokenized_text'] = df['cleaned_text'].apply(lambda x: tokenizer.encode(x[:max_length], return_tensors='pt'))

# GPT-2 defines no pad token by default, so fall back to 0.
padding_value = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

# Right-pad every sequence (squeezed from (1, L) to (L,)) to the longest one.
padded_sequences = pad_sequence([seq.squeeze(0)[:max_length] for seq in df['tokenized_text']], batch_first=True, padding_value=padding_value)

# NOTE(review): torch.cat over the rows flattens the padded batch into one
# long 1-D tensor — confirm this shape is intended; the training loop below
# re-tokenizes each text itself and never uses input_ids/labels.
input_ids = torch.cat(tuple(padded_sequences), dim=0)
labels = input_ids.clone()
307
-
308
- """Fine-Tuning: Ready to fine-tune the GPT-2 model"""
309
-
310
# Sanity check that the tensors from the padding step were created.
if 'input_ids' in locals() and 'labels' in locals():
    print("input_ids and labels are defined.")
else:
    print("input_ids and labels are not defined.")

# Fine-tuning hyperparameters.
num_epochs = 3
learning_rate = 5e-5   # typical GPT-2 fine-tuning learning rate
weight_decay = 0.01    # decoupled weight decay applied by AdamW
warmup_steps = 500     # linear warmup steps for the scheduler
max_seq_length = 1024  # GPT-2 context-window limit used at tokenization time

# Optimizer and linear LR scheduler.
# NOTE(review): AdamW here comes from the transformers import above, which is
# deprecated in recent releases — torch.optim.AdamW is the replacement; verify
# against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
# One optimizer step per document per epoch, hence len(df) * num_epochs total.
scheduler = get_scheduler("linear", optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(df) * num_epochs)
326
-
327
- """Manual Training Loop Method: In this method, the training loop is implemented manually, without using a custom trainer class."""
328
-
329
# --- Manual fine-tuning loop (batch size 1: one document per step) -----------
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for text in df['cleaned_text']:
        # Re-tokenize each document, truncated to the model's context window.
        input_ids = tokenizer.encode(text, return_tensors='pt', max_length=max_seq_length, truncation=True)
        optimizer.zero_grad()
        # Causal-LM objective: labels == inputs (the model shifts internally).
        outputs = model(input_ids=input_ids, labels=input_ids)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()  # advance LR once per document, matching the scheduler's step count
    average_loss = total_loss / len(df)
    print(f"Epoch {epoch+1}: Average Loss = {average_loss}")

# Persist the fine-tuned weights.
model.save_pretrained('fine_tuned_gpt2_model')

# Reload them as a separate model instance for comparison below.
fine_tuned_model = GPT2LMHeadModel.from_pretrained('fine_tuned_gpt2_model')
350
-
351
- """Comparison of Baseline and Fine-Tuned GPT-2 Model Responses
352
-
353
- """
354
-
355
# Baseline GPT-2 Model Response
# NOTE(review): third redefinition of baseline_generate_response in this file;
# only this binding is live from here on. Also, `model` was fine-tuned in
# place above, so this "baseline" now runs the fine-tuned weights too.
def baseline_generate_response(input_text):
    """Generate a reply (max 100 tokens) from the global `model`."""
    # Tokenize the input text
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    # Mask EOS positions so generate() does not attend to them.
    output = model.generate(input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id, attention_mask=input_ids.ne(tokenizer.eos_token_id))
    # Decode the output
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Fine-Tuned GPT-2 Model Response
def fine_tuned_generate_response(input_text):
    """Generate a reply (max 100 tokens) from the reloaded fine-tuned model."""
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    output = fine_tuned_model.generate(input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id, attention_mask=input_ids.ne(tokenizer.eos_token_id))
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Example usage
input_text = "How does pregnancy affect memory?"

baseline_response = baseline_generate_response(input_text)
fine_tuned_response = fine_tuned_generate_response(input_text)

print("Baseline Response:", baseline_response)
print("Fine-Tuned Response:", fine_tuned_response)

# Compare the responses (expected to match, given the note above).
print("Are the responses the same?")
print(baseline_response == fine_tuned_response)
384
-
385
- """#Deployment: Chatbox Development: Define the Streamlit app interface and functionality"""
386
-
387
# --- Streamlit chat interface ------------------------------------------------
st.title("Memory Support Chatbox for Pregnant Women")
user_input = st.text_input("You:", "Enter your message here...")
if user_input:
    # Encode the user's message and generate a reply with the in-memory model.
    input_ids = tokenizer.encode(user_input, return_tensors='pt')
    reply_ids = model.generate(input_ids, max_length=100, pad_token_id=tokenizer.eos_token_id)
    reply_text = tokenizer.decode(reply_ids[0], skip_special_tokens=True)
    st.text_area("Chatbot:", value=reply_text, height=200)


# Text Analysis
st.subheader("Text Analysis")

# Word cloud of the cleaned corpus.
st.subheader("Word Cloud")
all_text = " ".join(df["cleaned_text"])
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(all_text)
# Build the figure and hand it to Streamlit explicitly: st.pyplot() with no
# argument is deprecated and removed in current Streamlit releases.
fig = plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
st.pyplot(fig)
408
-
409
- """#Model Evaluation"""
410
-
411
# Generate sample responses from the (now fine-tuned) model to spot-check
# that it produces relevant, coherent advice.

# Sample prompts
sample_prompts = [
    "What causes pregnancy brain fog?",
    "How does pregnancy affect the brain?",
    "How can I improve my memory during pregnancy?",
    "Can pregnancy brain fog affect my ability to work or perform daily tasks?",
]

# Generate one continuation (max 100 tokens) per prompt and print it.
for prompt in sample_prompts:
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    output = model.generate(input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id, attention_mask=input_ids.ne(tokenizer.eos_token_id))
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    print("Response:", response)
427
-
428
- """##Visualization of Model Responses to Sample Prompts
429
-
430
- Can help one understand the nature of the responses generated by the language model and
431
- identify common patterns or topics across the responses
432
- """
433
-
434
# Sample prompts
sample_prompts = [
    "What causes pregnancy brain fog?",
    "How does pregnancy affect the brain?",
    "How can I improve my memory during pregnancy?",
    "Can pregnancy brain fog affect my ability to work or perform daily tasks?",
]

# Generate a response per prompt and collect prompt/response pairs.
data = []
for prompt in sample_prompts:
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    output = model.generate(input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id, attention_mask=input_ids.ne(tokenizer.eos_token_id))
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    data.append({"prompt": prompt, "response": response})

# NOTE(review): this rebinds df, clobbering the article DataFrame built earlier.
df = pd.DataFrame(data)

# Display the DataFrame
print(df)

# Pie chart: one slice per prompt, sized by the word count of its response.
plt.figure(figsize=(8, 8))
plt.pie([len(response.split()) for response in df['response']], labels=df['prompt'], autopct='%1.1f%%', startangle=90)
plt.title('Distribution of response lengths for sample prompts')
plt.axis('equal')
plt.show()