Upload 7 files
Browse files- FileUtil.py +20 -0
- MistralaiChat.py +124 -0
- adamneveml/dataset.csv +4 -0
- adamneveml/datasetcsv.py +69 -0
- adamneveml/features.pickle +3 -0
- adamneveml/purify.py +159 -0
- history.bin +3 -0
FileUtil.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pickle
|
2 |
+
import os
|
3 |
+
|
4 |
+
|
5 |
+
def write_to_file(data, filename):
    """Serialize ``data`` with pickle into ``filename``.

    The file is opened in binary write mode, so any existing content is
    replaced.
    """
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink).dump(data)
|
8 |
+
|
9 |
+
|
10 |
+
def read_from_file(filename):
    """Load a pickled object from ``filename``.

    Returns an empty list when the file does not exist (an empty placeholder
    file is created in that case) or when its content cannot be unpickled.
    """
    if not os.path.exists(filename):
        # Create a blank file if it doesn't exist
        with open(filename, 'w'):
            pass
        return []
    with open(filename, 'rb') as file:
        try:
            # NOTE(security): pickle.load can execute arbitrary code; only use
            # this on files written by this application.
            return pickle.load(file)
        except Exception:
            # Best-effort load: an empty or corrupt file yields an empty list.
            # ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate.
            print("An exception occurred")
            return []
|
MistralaiChat.py
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from huggingface_hub import InferenceClient
|
2 |
+
import gradio as gr
|
3 |
+
import random
|
4 |
+
import FileUtil
|
5 |
+
|
6 |
+
# Hub model id served through the Hugging Face inference client.
_MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
client = InferenceClient(_MODEL_ID)
|
7 |
+
|
8 |
+
|
9 |
+
def format_prompt(message, history):
    """Build the ``[INST]``-tagged prompt string from past turns and ``message``.

    ``history`` is iterated as ``(user_prompt, bot_response)`` pairs; each pair
    becomes ``[INST] user [/INST] bot</s> `` and the new ``message`` is appended
    as a final, unanswered ``[INST]`` section.
    """
    pieces = ["<s>"]
    for past_user, past_bot in history:
        pieces.append(f"[INST] {past_user} [/INST]")
        pieces.append(f" {past_bot}</s> ")
    pieces.append(f"[INST] {message} [/INST]")
    return "".join(pieces)
|
17 |
+
|
18 |
+
|
19 |
+
def generate(
    prompt, history, system_prompt, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0,
):
    """Stream a model reply for ``prompt``, yielding the text accumulated so far.

    Args:
        prompt: The new user message.
        history: Previous ``(user, bot)`` turns. When empty, the turns stored
            in ``history.bin`` are loaded and used instead.
        system_prompt: Optional text prepended to ``prompt`` (comma-separated).
        temperature: Sampling temperature; values below 0.01 are raised to 0.01.
        max_new_tokens: Upper bound on generated tokens.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty applied to repeated tokens.

    Yields:
        The reply text generated so far, growing by one token per step.
    """
    # ``not history`` also covers ``None``, which ``len(history)`` rejected.
    if not history:
        history = FileUtil.read_from_file("history.bin")

    # Clamp the temperature to a small positive floor.
    temperature = max(float(temperature), 1e-2)

    generate_kwargs = dict(
        temperature=temperature,
        # UI sliders may deliver floats; coerce to the types the API expects.
        max_new_tokens=int(max_new_tokens),
        top_p=float(top_p),
        repetition_penalty=float(repetition_penalty),
        do_sample=True,
        # A fresh seed per request so repeated prompts give different samples.
        seed=random.randint(1, 99999),
    )
    print(" your prompt: "+prompt)

    # Only prefix the system prompt when one was supplied; otherwise the model
    # would receive ", <prompt>" or "None, <prompt>".
    message = f"{system_prompt}, {prompt}" if system_prompt else prompt
    formatted_prompt = format_prompt(message, history)
    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True,
                                    return_full_text=False)

    # NOTE(review): this persists the history as received, before the current
    # exchange completes, so the newest turn is not stored - confirm intent.
    FileUtil.write_to_file(history, 'history.bin')

    output = ""
    for response in stream:
        output += response.token.text
        yield output
    # Generator return value; only visible to callers using ``yield from``.
    return output
|
53 |
+
|
54 |
+
|
55 |
+
# Extra UI controls. Their order in ``additional_inputs`` follows the order of
# the parameters that come after ``history`` in ``generate``.
_system_prompt_box = gr.Textbox(label="System Prompt", max_lines=1, interactive=True)

_temperature_slider = gr.Slider(
    label="Temperature",
    value=0.9,
    minimum=0.0,
    maximum=1.0,
    step=0.05,
    interactive=True,
    info="Higher values produce more diverse outputs",
)

_max_tokens_slider = gr.Slider(
    label="Max new tokens",
    value=1048,
    minimum=0,
    maximum=1048 * 10,
    step=64,
    interactive=True,
    info="The maximum numbers of new tokens",
)

_top_p_slider = gr.Slider(
    label="Top-p (nucleus sampling)",
    value=0.90,
    minimum=0.0,
    maximum=1,
    step=0.05,
    interactive=True,
    info="Higher values sample more low-probability tokens",
)

_repetition_slider = gr.Slider(
    label="Repetition penalty",
    value=1.2,
    minimum=1.0,
    maximum=2.0,
    step=0.05,
    interactive=True,
    info="Penalize repeated tokens",
)

additional_inputs = [
    _system_prompt_box,
    _temperature_slider,
    _max_tokens_slider,
    _top_p_slider,
    _repetition_slider,
]
|
98 |
+
|
99 |
+
# Sample prompts offered in the UI. Each example row is the prompt followed by
# five ``None`` placeholders, one per entry in ``additional_inputs``.
_EXAMPLE_PROMPTS = (
    "I'm planning a vacation to Japan. Can you suggest a one-week itinerary including must-visit places and local cuisines to try?",
    "Can you write a short story about a time-traveling detective who solves historical mysteries?",
    "I'm trying to learn French. Can you provide some common phrases that would be useful for a beginner, along with their pronunciations?",
    "I have chicken, rice, and bell peppers in my kitchen. Can you suggest an easy recipe I can make with these ingredients?",
    "Can you explain how the QuickSort algorithm works and provide a Python implementation?",
    "What are some unique features of Rust that make it stand out compared to other systems programming languages like C++?",
)

examples = [[text] + [None] * 5 for text in _EXAMPLE_PROMPTS]
|
116 |
+
|
117 |
+
# Assemble the chat UI around ``generate`` and start serving it.
_chat_panel = gr.Chatbot(
    show_label=True,
    show_share_button=True,
    show_copy_button=True,
    likeable=True,
    layout="panel",
)
_demo = gr.ChatInterface(
    fn=generate,
    chatbot=_chat_panel,
    additional_inputs=additional_inputs,
    title="Chat Example",
    examples=examples,
    concurrency_limit=20,
)
_demo.launch(show_api=True, share=True)
|
adamneveml/dataset.csv
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
id,text,sentiment,length,language,source,gender,age_group,time_period,category,entities,question,answer
|
2 |
+
1,Horrible pizza at this place.,-1,32,ENG,Twitter,Female,YoungAdult,2023-Q2,review,"['pizza', 'place']",Is this a bad restaurant review?,FALSE
|
3 |
+
2,"The first manned mission landed on the moon on July 20th, 1969.",0,24,ENG,Article,Male,Adult,2021-Q1,science,"['apollo', 'moon', 'landing', 'July 20th, 1969']",Did humans land on Mars?,FALSE
|
4 |
+
3,"Can machines think? Or more specifically, can robots drive cars?",1,25,ENG,Blog,Other,GenX,2015-Q4,qa,[],Can robots drive cars?,TRUE
|
adamneveml/datasetcsv.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# create_toy_dataset.py
|
2 |
+
from random import randrange
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
from datetime import date, timedelta
|
6 |
+
|
7 |
+
|
8 |
+
def get_random_date():
    """Return a random date in the later half of the span 2015-01-01 .. today.

    A percentage between 50 and 99 of the elapsed days is added to the start
    date. The previous code multiplied the elapsed time by that number instead
    of treating it as a percentage, which produced dates far in the future.
    """
    start = date(2015, 1, 1)
    elapsed_days = (date.today() - start).days
    percent = randrange(50, 100)
    return start + timedelta(days=elapsed_days * percent // 100)
|
12 |
+
|
13 |
+
|
14 |
+
# Define the dataset as a list of dictionaries
|
15 |
+
def _as_quarter(day):
    """Format a date as a 'YYYY-Qn' label, matching the other records."""
    return f"{day.year}-Q{(day.month - 1) // 3 + 1}"


# Three hand-written sample records. ``time_period`` is a 'YYYY-Qn' string in
# every record; the first one derives it from a random date (it previously
# stored the raw ``date`` object, mixing types within the column).
toy_dataset = [
    {
        'id': 1,
        'text': 'Horrible pizza at this place.',
        'sentiment': -1,
        'length': 32,
        'language': 'ENG',
        'source': 'Twitter',
        'gender': 'Female',
        'age_group': 'YoungAdult',
        'time_period': _as_quarter(get_random_date()),
        'category': 'review',
        'entities': ['pizza', 'place'],
        'question': "Is this a bad restaurant review?",
        'answer': False,
    },
    {
        'id': 2,
        'text': "The first manned mission landed on the moon on July 20th, 1969.",
        'sentiment': 0,
        'length': 24,
        'language': 'ENG',
        'source': 'Article',
        'gender': 'Male',
        'age_group': 'Adult',
        'time_period': '2021-Q1',
        'category': 'science',
        'entities': ['apollo', 'moon', 'landing', 'July 20th, 1969'],
        'question': "Did humans land on Mars?",
        'answer': False,
    },
    {
        'id': 3,
        'text': "Can machines think? Or more specifically, can robots drive cars?",
        'sentiment': 1,
        'length': 25,
        'language': 'ENG',
        'source': 'Blog',
        'gender': 'Other',
        'age_group': 'GenX',
        'time_period': '2015-Q4',
        'category': 'qa',
        'entities': [],
        'question': "Can robots drive cars?",
        'answer': True,
    },
]
|
62 |
+
|
63 |
+
# Convert the dataset list into a DataFrame
|
64 |
+
# Destination CSV, written relative to the current working directory
output_file = 'dataset.csv'

# Tabulate the records and export them without the index column
df = pd.DataFrame(data=toy_dataset)
df.to_csv(path_or_buf=output_file, index=False)

print(f'Successfully created "{output_file}"!')
|
adamneveml/features.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b80cf203e6c8a28aac140f8335c021cc12a567cc9becd21ce8ce20d9f680e449
|
3 |
+
size 849
|
adamneveml/purify.py
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import csv
|
2 |
+
import math
|
3 |
+
import pickle
|
4 |
+
from collections import Counter
|
5 |
+
from itertools import chain
|
6 |
+
|
7 |
+
import nltk
|
8 |
+
import numpy as np
|
9 |
+
import unicodedata
|
10 |
+
|
11 |
+
# Download the NLTK resources this script asks for, in the original order.
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)
|
13 |
+
|
14 |
+
# Read the 'text' column of every row in dataset.csv into ``sentences``.
# newline='' lets the csv module handle line endings itself, so newlines
# embedded in quoted fields are parsed correctly.
with open('dataset.csv', 'r', encoding='utf-8', newline='') as f:
    input_reader = csv.DictReader(f)
    sentences = [row['text'] for row in input_reader]
|
17 |
+
|
18 |
+
|
19 |
+
# Remove diacritics from all characters
|
20 |
+
def remove_diacritics(text):
    """Return ``text`` reduced to ASCII.

    The text is NFKD-normalized so accented letters split into a base letter
    plus combining marks; encoding to ASCII with ``'ignore'`` then drops every
    non-ASCII code point.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
|
25 |
+
|
26 |
+
|
27 |
+
def filter_non_alphabeticals(words):
    """Return the entries of ``words`` that are non-empty and purely alphabetic."""
    alphabetic = []
    for candidate in words:
        if not candidate:
            continue
        if candidate.isalpha():
            alphabetic.append(candidate)
    return alphabetic
|
29 |
+
|
30 |
+
|
31 |
+
# Lowercase and concatenate words for bigram extraction
|
32 |
+
def flatten_bigram_components(text):
    """Tokenize ``text`` and return lower-cased, whitespace-split word pieces.

    Tokens from ``nltk.word_tokenize`` are lower-cased and stripped, joined
    with single spaces, then split on whitespace again.
    """
    normalized = []
    for token in nltk.word_tokenize(text):
        normalized.append(token.lower().strip())
    rejoined = ' '.join(normalized)
    return rejoined.split()
|
34 |
+
|
35 |
+
|
36 |
+
# Create bigram frequency distribution
|
37 |
+
def get_top_biagrams(counter, num=5):
    """Return the ``num`` most frequent bigrams as ``("w1 w2", count)`` tuples.

    ``counter`` maps ``(w1, w2)`` pairs to counts. Results are ordered by
    descending count; ties keep the mapping's iteration order.
    """
    labelled = []
    for pair, count in counter.items():
        labelled.append((pair[0] + ' ' + pair[1], count))
    labelled.sort(key=lambda entry: entry[1], reverse=True)
    return labelled[:num]
|
41 |
+
|
42 |
+
|
43 |
+
# Calculate TF-IDF weightings for term importance
|
44 |
+
def tf_idf(term_freqs, inverse_docs):
    """Weight each term frequency by a log-scaled inverse document statistic.

    Args:
        term_freqs: Mapping of term -> frequency.
        inverse_docs: Sequence of per-document mappings of term -> value.

    Returns:
        Dict of term -> ``freq * log10(len(inverse_docs) / total)``, where
        ``total`` is the sum of the term's values over the documents that
        contain it. Terms with a non-positive total (for example, terms that
        appear in no document) get a weight of ``0.0`` instead of raising
        ``ZeroDivisionError`` or a math domain error.
    """
    weights = {}
    doc_count = float(len(inverse_docs))
    for term, freq in term_freqs.items():
        denominator = sum(inv_doc[term] for inv_doc in inverse_docs if term in inv_doc)
        if denominator <= 0:
            # No usable document statistic for this term.
            weights[term] = 0.0
            continue
        weights[term] = float(freq) * math.log10(doc_count / denominator)
    return weights
|
50 |
+
|
51 |
+
|
52 |
+
# Apply NLTK's SnowballStemmer
|
53 |
+
def snowball_stemmer(words):
    """Return the English Snowball stem of every word in ``words``, in order."""
    english_stemmer = nltk.SnowballStemmer('english')
    return list(map(english_stemmer.stem, words))
|
57 |
+
|
58 |
+
|
59 |
+
def save_feature(filename, X):
    """Pickle ``X`` into ``filename`` using the highest available protocol."""
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL).dump(X)
|
62 |
+
|
63 |
+
|
64 |
+
def _preprocess(raw_sentences):
    """Turn each sentence into lower-cased, alphabetic, non-stopword tokens."""
    # Build the stopword set once instead of re-reading the list per word.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    processed = []
    for sen in raw_sentences:
        tokens = nltk.word_tokenize(remove_diacritics(sen).lower())
        kept = [word for word in tokens if word not in stop_words]
        processed.append(filter_non_alphabeticals(kept))
    return processed


def _tf_idf_matrix(documents):
    """Return a (documents x vocabulary) TF-IDF matrix for tokenized documents.

    Columns follow the sorted vocabulary. ``tf`` is a term's count divided by
    the document's token count (empty documents give an all-zero row) and
    ``idf`` is ``log(N / (1 + document_frequency))``, so a term present in
    every document gets a negative weight.
    """
    vocabulary = sorted(set(chain.from_iterable(documents)))
    column_of = {word: col for col, word in enumerate(vocabulary)}

    counts = np.zeros((len(documents), len(vocabulary)))
    for row, tokens in enumerate(documents):
        for word in tokens:
            counts[row, column_of[word]] += 1

    totals = counts.sum(axis=1, keepdims=True)
    # Leave rows of empty documents at zero instead of dividing by zero.
    tf = np.divide(counts, totals, out=np.zeros_like(counts), where=totals > 0)

    doc_freq = (counts > 0).sum(axis=0)
    idf = np.log(len(documents) / (1.0 + doc_freq))
    return tf * idf


def main():
    """Report the top bigrams of the corpus and save its TF-IDF features.

    Reads the module-level ``sentences``, prints the five most frequent
    bigrams, and pickles a (documents x vocabulary) TF-IDF matrix to
    ``features.pickle``.

    The previous version built the final features from the last sentence only
    (a leaked loop variable) and used an IDF that was identical for every
    term; both are replaced by a per-document computation.
    """
    filtered_sentences = _preprocess(sentences)

    bigrams = []
    for sentence in filtered_sentences:
        expanded_flat_sentence = flatten_bigram_components(' '.join(sentence))
        print("Expanded Flat Sentence:", expanded_flat_sentence)
        bigrams.extend(nltk.bigrams(expanded_flat_sentence))

    bigram_counter = Counter(bigrams)
    print("\nTop five occurring bigrams:\n")
    print(*get_top_biagrams(bigram_counter, 5), sep='\n')

    features = _tf_idf_matrix(filtered_sentences)

    # Display a couple of samples for verification
    print("Sample TF-IDF Features (before stacking):", features[:3])

    # Print the shape of the resulting features array
    print("Features Array Shape:", features.shape)

    # Save the features as a pickle file
    save_feature('features.pickle', features)
|
157 |
+
|
158 |
+
|
159 |
+
main()
|
history.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f0487c2986ecb7b7f894f81252cbc9196724903a003ee5c8d444cd9a23aee63f
|
3 |
+
size 14007
|