kreem22 committed
Commit
5042eab
1 Parent(s): cdac087

Upload 7 files

FileUtil.py ADDED
@@ -0,0 +1,20 @@
+ import os
+ import pickle
+
+
+ def write_to_file(data, filename):
+     """Serialize data to a binary file with pickle."""
+     with open(filename, 'wb') as file:
+         pickle.dump(data, file)
+
+
+ def read_from_file(filename):
+     """Deserialize data from a binary file; return [] if the file is missing or unreadable."""
+     if not os.path.exists(filename):
+         open(filename, 'w').close()  # Create a blank file if it doesn't exist
+         return []
+     with open(filename, 'rb') as file:
+         try:
+             data = pickle.load(file)
+         except (pickle.UnpicklingError, EOFError) as exc:
+             print(f"An exception occurred while unpickling: {exc}")
+             return []
+     return data
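For reference, a minimal round-trip sketch of these helpers (the filename and payload here are illustrative; MistralaiChat.py below uses history.bin):

import FileUtil

FileUtil.write_to_file([("hi", "hello!")], "demo.bin")
restored = FileUtil.read_from_file("demo.bin")
assert restored == [("hi", "hello!")]  # pickle round-trip preserves the list of tuples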
MistralaiChat.py ADDED
@@ -0,0 +1,124 @@
+ from huggingface_hub import InferenceClient
+ import gradio as gr
+ import random
+ import FileUtil
+
+ client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
+
+
+ def format_prompt(message, history):
+     """Build a Mixtral instruction prompt from past (user, bot) turns plus the new message."""
+     prompt = "<s>"
+     for user_prompt, bot_response in history:
+         prompt += f"[INST] {user_prompt} [/INST]"
+         prompt += f" {bot_response}</s> "
+     prompt += f"[INST] {message} [/INST]"
+     return prompt
+
+
+ def generate(
+     prompt, history, system_prompt, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0,
+ ):
+     # On a fresh session, restore the persisted chat history from disk
+     if len(history) == 0:
+         history = FileUtil.read_from_file("history.bin")
+
+     temperature = float(temperature)
+     if temperature < 1e-2:
+         temperature = 1e-2
+     top_p = float(top_p)
+
+     generate_kwargs = dict(
+         temperature=temperature,
+         max_new_tokens=max_new_tokens,
+         top_p=top_p,
+         repetition_penalty=repetition_penalty,
+         do_sample=True,
+         seed=random.randint(1, 99999),
+         # seed=42,
+     )
+     print("your prompt: " + prompt)
+     formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history)
+     stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True,
+                                     return_full_text=False)
+     output = ""
+
+     FileUtil.write_to_file(history, 'history.bin')
+
+     # Stream partial output back to the UI as tokens arrive
+     for response in stream:
+         output += response.token.text
+         yield output
+
+
+ additional_inputs = [
+     gr.Textbox(
+         label="System Prompt",
+         max_lines=1,
+         interactive=True,
+     ),
+     gr.Slider(
+         label="Temperature",
+         value=0.9,
+         minimum=0.0,
+         maximum=1.0,
+         step=0.05,
+         interactive=True,
+         info="Higher values produce more diverse outputs",
+     ),
+     gr.Slider(
+         label="Max new tokens",
+         value=1048,
+         minimum=0,
+         maximum=1048 * 10,
+         step=64,
+         interactive=True,
+         info="The maximum number of new tokens",
+     ),
+     gr.Slider(
+         label="Top-p (nucleus sampling)",
+         value=0.90,
+         minimum=0.0,
+         maximum=1.0,
+         step=0.05,
+         interactive=True,
+         info="Higher values sample more low-probability tokens",
+     ),
+     gr.Slider(
+         label="Repetition penalty",
+         value=1.2,
+         minimum=1.0,
+         maximum=2.0,
+         step=0.05,
+         interactive=True,
+         info="Penalize repeated tokens",
+     ),
+ ]
+
+ examples = [
+     ["I'm planning a vacation to Japan. Can you suggest a one-week itinerary including must-visit places and local cuisines to try?",
+      None, None, None, None, None],
+     ["Can you write a short story about a time-traveling detective who solves historical mysteries?",
+      None, None, None, None, None],
+     ["I'm trying to learn French. Can you provide some common phrases that would be useful for a beginner, along with their pronunciations?",
+      None, None, None, None, None],
+     ["I have chicken, rice, and bell peppers in my kitchen. Can you suggest an easy recipe I can make with these ingredients?",
+      None, None, None, None, None],
+     ["Can you explain how the QuickSort algorithm works and provide a Python implementation?",
+      None, None, None, None, None],
+     ["What are some unique features of Rust that make it stand out compared to other systems programming languages like C++?",
+      None, None, None, None, None],
+ ]
+
+ gr.ChatInterface(
+     fn=generate,
+     chatbot=gr.Chatbot(show_label=True, show_share_button=True, show_copy_button=True, likeable=True, layout="panel"),
+     additional_inputs=additional_inputs,
+     title="Chat Example",
+     examples=examples,
+     concurrency_limit=20,
+ ).launch(show_api=True, share=True)
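Because generate is a plain Python generator, it can also be driven outside the Gradio UI; an illustrative sketch (the prompt and system prompt here are made up, and it assumes the Inference API is reachable):

# Illustrative only: consume the streaming generator and keep the last (full) output
last = ""
for partial in generate("Hello!", history=[], system_prompt="You are a helpful assistant"):
    last = partial
print(last)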
adamneveml/dataset.csv ADDED
@@ -0,0 +1,4 @@
+ id,text,sentiment,length,language,source,gender,age_group,time_period,category,entities,question,answer
+ 1,Horrible pizza at this place.,-1,32,ENG,Twitter,Female,YoungAdult,2023-Q2,review,"['pizza', 'place']",Is this a bad restaurant review?,FALSE
+ 2,"The first manned mission landed on the moon on July 20th, 1969.",0,24,ENG,Article,Male,Adult,2021-Q1,science,"['apollo', 'moon', 'landing', 'July 20th, 1969']",Did humans land on Mars?,FALSE
+ 3,"Can machines think? Or more specifically, can robots drive cars?",1,25,ENG,Blog,Other,GenX,2015-Q4,qa,[],Can robots drive cars?,TRUE
adamneveml/datasetcsv.py ADDED
@@ -0,0 +1,69 @@
+ # create_toy_dataset.py
+ from random import randrange
+ from datetime import date, timedelta
+
+ import pandas as pd
+
+
+ def get_random_date():
+     """Generate a random date between 2015-01-01 and today."""
+     span_seconds = (date.today() - date(2015, 1, 1)).total_seconds()
+     # Take a random 50-99% slice of the span so the result stays within the valid range
+     return date(2015, 1, 1) + timedelta(seconds=int(round(span_seconds * randrange(50, 100) / 100)))
+
+
+ # Define the dataset as a list of dictionaries
+ toy_dataset = [
+     {
+         'id': 1,
+         'text': 'Horrible pizza at this place.',
+         'sentiment': -1,
+         'length': 32,
+         'language': 'ENG',
+         'source': 'Twitter',
+         'gender': 'Female',
+         'age_group': 'YoungAdult',
+         'time_period': get_random_date(),  # random date; the other rows use fixed quarter strings
+         'category': 'review',
+         'entities': ['pizza', 'place'],
+         'question': "Is this a bad restaurant review?",
+         'answer': False
+     },
+     {
+         'id': 2,
+         'text': "The first manned mission landed on the moon on July 20th, 1969.",
+         'sentiment': 0,
+         'length': 24,
+         'language': 'ENG',
+         'source': 'Article',
+         'gender': 'Male',
+         'age_group': 'Adult',
+         'time_period': '2021-Q1',
+         'category': 'science',
+         'entities': ['apollo', 'moon', 'landing', 'July 20th, 1969'],
+         'question': "Did humans land on Mars?",
+         'answer': False
+     },
+     {
+         'id': 3,
+         'text': "Can machines think? Or more specifically, can robots drive cars?",
+         'sentiment': 1,
+         'length': 25,
+         'language': 'ENG',
+         'source': 'Blog',
+         'gender': 'Other',
+         'age_group': 'GenX',
+         'time_period': '2015-Q4',
+         'category': 'qa',
+         'entities': [],
+         'question': "Can robots drive cars?",
+         'answer': True
+     }
+ ]
+
+ # Convert the dataset list into a DataFrame
+ df = pd.DataFrame(toy_dataset)
+
+ # Write DataFrame contents into a CSV file
+ output_file = 'dataset.csv'
+ df.to_csv(output_file, index=False)
+ print(f'Successfully created "{output_file}"!')
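As a quick sanity check (illustrative, not part of the commit), the generated CSV can be reloaded with pandas:

import pandas as pd

df = pd.read_csv('dataset.csv')
print(df.shape)                  # expected: (3, 13)
print(df['sentiment'].tolist())  # expected: [-1, 0, 1]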
adamneveml/features.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b80cf203e6c8a28aac140f8335c021cc12a567cc9becd21ce8ce20d9f680e449
+ size 849
adamneveml/purify.py ADDED
@@ -0,0 +1,159 @@
+ import csv
+ import math
+ import pickle
+ from collections import Counter
+ from itertools import chain
+
+ import nltk
+ import numpy as np
+ import unicodedata
+
+ nltk.download('stopwords')
+ nltk.download('punkt')
+
+ with open('dataset.csv', 'r', encoding='utf-8') as f:
+     input_reader = csv.DictReader(f)
+     sentences = [row['text'] for row in input_reader]
+
+
+ # Remove diacritics from all characters
+ def remove_diacritics(text):
+     text = unicodedata.normalize('NFKD', text) \
+         .encode('ascii', 'ignore') \
+         .decode('utf-8', 'ignore')
+     return text
+
+
+ def filter_non_alphabeticals(words):
+     return [w for w in words if w and w.isalpha()]
+
+
+ # Lowercase and re-split words for bigram extraction
+ def flatten_bigram_components(text):
+     return ' '.join([word.lower().strip() for word in nltk.word_tokenize(text)]).split()
+
+
+ # Return the most frequent bigrams from a Counter of (word, word) pairs
+ def get_top_bigrams(counter, num=5):
+     bigrams = [(k[0] + ' ' + k[1], v) for k, v in counter.items()]
+     return sorted(bigrams, reverse=True, key=lambda kv: kv[1])[:num]
+
+
+ # Calculate TF-IDF weightings for term importance; inverse_docs is a list of
+ # per-document term-count mappings
+ def tf_idf(term_freqs, inverse_docs):
+     weights = {}
+     for term, freq in term_freqs.items():
+         denominator = sum(inv_doc[term] for inv_doc in inverse_docs if term in inv_doc)
+         weights[term] = float(freq) * math.log10(float(len(inverse_docs)) / denominator)
+     return weights
+
+
+ # Apply NLTK's SnowballStemmer
+ def snowball_stemmer(words):
+     stemmer = nltk.SnowballStemmer('english')
+     return [stemmer.stem(word) for word in words]
+
+
+ def save_feature(filename, X):
+     with open(filename, 'wb') as fout:
+         pickle.dump(X, fout, protocol=pickle.HIGHEST_PROTOCOL)
+
+
+ def main():
+     # Normalize, tokenize, and strip stopwords and non-alphabetical tokens
+     lowercased_sentences = [remove_diacritics(sen).lower() for sen in sentences]
+     tokenized_sentences = [nltk.word_tokenize(sen) for sen in lowercased_sentences]
+     stopwords = set(nltk.corpus.stopwords.words('english'))
+     stopword_removed_sentences = [[word for word in sentence if word not in stopwords]
+                                   for sentence in tokenized_sentences]
+     filtered_sentences = [filter_non_alphabeticals(sentence) for sentence in stopword_removed_sentences]
+
+     # Collect bigram frequencies across all sentences
+     bigrams = []
+     for sentence in filtered_sentences:
+         expanded_flat_sentence = flatten_bigram_components(' '.join(sentence))
+         print("Expanded Flat Sentence:", expanded_flat_sentence)
+         bigrams.extend(nltk.bigrams(expanded_flat_sentence))
+
+     bigram_counter = Counter(bigrams)
+     print("\nTop five occurring bigrams:\n")
+     print(*get_top_bigrams(bigram_counter, 5), sep='\n')
+
+     # Sorted vocabulary over all filtered words
+     filtered_unique_words = sorted(set(chain.from_iterable(filtered_sentences)))
+
+     # Per-document term counts; these double as the "inverse documents" used
+     # for the IDF denominator in tf_idf()
+     doc_counters = [Counter(sen) for sen in filtered_sentences]
+
+     # TF-IDF per document, projected onto the shared vocabulary so every row
+     # has the same length
+     tf_idfs = []
+     for counter in doc_counters:
+         weights = tf_idf(counter, doc_counters)
+         tf_idfs.append(np.array([weights.get(word, 0.0) for word in filtered_unique_words]))
+
+     # Display a couple of samples for verification
+     print("Sample TF-IDF Features (before stacking):", tf_idfs[:3])
+
+     # Stack the TF-IDF features together
+     features = np.vstack(tf_idfs)
+
+     # Print the shape of the resulting features array
+     print("Features Array Shape:", features.shape)
+
+     # Save the features as a pickle file
+     save_feature('features.pickle', features)
+
+
+ main()
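The saved features can be reloaded with the standard pickle module; a minimal sketch, assuming purify.py has been run in the same directory:

import pickle

with open('features.pickle', 'rb') as fin:
    features = pickle.load(fin)
print(type(features), features.shape)  # a NumPy array, one row per sentence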
history.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f0487c2986ecb7b7f894f81252cbc9196724903a003ee5c8d444cd9a23aee63f
+ size 14007