Spaces:

kreem22
/

MistralAI

Runtime error

App Files Files Community

kreem22 committed on Apr 10

Commit

5042eab

•

1 Parent(s): cdac087

Upload 7 files

Browse files

Files changed (7) hide show

FileUtil.py +20 -0
MistralaiChat.py +124 -0
adamneveml/dataset.csv +4 -0
adamneveml/datasetcsv.py +69 -0
adamneveml/features.pickle +3 -0
adamneveml/purify.py +159 -0
history.bin +3 -0

FileUtil.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import pickle
+import os
def write_to_file(data, filename):
    """Pickle ``data`` into ``filename``, replacing any existing content.

    The file is opened in binary write mode, so a previous file at the same
    path is truncated before the new payload is written.
    """
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(data)
def read_from_file(filename):
    """Load the pickled object stored in ``filename``.

    Returns an empty list when the file does not exist (a blank file is
    created in that case, as before), when it is empty, or when its content
    cannot be unpickled. Reading is best-effort: failures are reported on
    stdout instead of being raised.

    NOTE(review): ``pickle.load`` can execute arbitrary code from the file;
    only use this on files written by this application.
    """
    if not os.path.exists(filename):
        # Create a blank file so later reads and writes find the path in place.
        with open(filename, 'w'):
            pass
        return []
    with open(filename, 'rb') as file:
        try:
            return pickle.load(file)
        except Exception as exc:
            # Unpickling can fail with many exception types (EOFError on an
            # empty file, UnpicklingError, AttributeError, ...). Catching
            # Exception rather than using a bare ``except`` keeps
            # KeyboardInterrupt and SystemExit propagating.
            print(f"An exception occurred: {exc!r}")
            return []

MistralaiChat.py ADDED Viewed

	@@ -0,0 +1,124 @@

+from huggingface_hub import InferenceClient
+import gradio as gr
+import random
+import FileUtil
+client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
def format_prompt(message, history):
    """Build the instruction-formatted prompt string for one request.

    Every ``(user_prompt, bot_response)`` pair in ``history`` becomes an
    ``[INST] ... [/INST]`` turn followed by the response and ``</s>``; the
    new ``message`` is appended as a final, unanswered ``[INST]`` turn.
    """
    pieces = ["<s>"]
    for user_prompt, bot_response in history:
        pieces.append(f"[INST] {user_prompt} [/INST] {bot_response}</s> ")
    pieces.append(f"[INST] {message} [/INST]")
    return "".join(pieces)
def generate(
        prompt, history, system_prompt, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0,
):
    """Stream a completion for ``prompt`` and persist the conversation.

    Args:
        prompt: The new user message.
        history: Earlier ``(user, bot)`` turns. When empty, the turns stored
            in ``history.bin`` are used instead.
        system_prompt: Optional text prepended to the message; skipped when
            empty or ``None`` so the model never sees a literal "None, ".
        temperature: Sampling temperature, floored at 0.01.
        max_new_tokens: Upper bound on generated tokens.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty applied to repeated tokens.

    Yields:
        The accumulated response text after each streamed token.
    """
    if not history:
        history = FileUtil.read_from_file("history.bin")
    # Coerce UI-provided values once, here, before handing them to the client.
    generate_kwargs = dict(
        temperature=max(float(temperature), 1e-2),
        max_new_tokens=int(max_new_tokens),
        top_p=float(top_p),
        repetition_penalty=float(repetition_penalty),
        do_sample=True,
        # A fresh random seed is drawn on every call.
        seed=random.randint(1, 99999),
    )
    print(" your prompt: " + prompt)
    message = f"{system_prompt}, {prompt}" if system_prompt else prompt
    formatted_prompt = format_prompt(message, history)
    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True,
                                    return_full_text=False)
    output = ""
    for response in stream:
        output += response.token.text
        yield output
    # Persist only after the stream has finished so the stored history also
    # contains the turn that was just produced.
    FileUtil.write_to_file(list(history) + [(prompt, output)], 'history.bin')
    return output
# Extra controls passed to gr.ChatInterface. Their order matches the
# parameters of `generate` after (prompt, history): system prompt,
# temperature, max new tokens, top-p, repetition penalty.
additional_inputs = [
    gr.Textbox(label="System Prompt", max_lines=1, interactive=True),
    gr.Slider(
        label="Temperature",
        info="Higher values produce more diverse outputs",
        minimum=0.0, maximum=1.0, step=0.05, value=0.9,
        interactive=True,
    ),
    gr.Slider(
        label="Max new tokens",
        info="The maximum numbers of new tokens",
        minimum=0, maximum=1048 * 10, step=64, value=1048,
        interactive=True,
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        info="Higher values sample more low-probability tokens",
        minimum=0.0, maximum=1, step=0.05, value=0.90,
        interactive=True,
    ),
    gr.Slider(
        label="Repetition penalty",
        info="Penalize repeated tokens",
        minimum=1.0, maximum=2.0, step=0.05, value=1.2,
        interactive=True,
    ),
]
# Example rows for the chat UI: one prompt followed by five None
# placeholders, one per entry in `additional_inputs`.
examples = [
    [example_prompt] + [None] * 5
    for example_prompt in (
        "I'm planning a vacation to Japan. Can you suggest a one-week itinerary including must-visit places and local cuisines to try?",
        "Can you write a short story about a time-traveling detective who solves historical mysteries?",
        "I'm trying to learn French. Can you provide some common phrases that would be useful for a beginner, along with their pronunciations?",
        "I have chicken, rice, and bell peppers in my kitchen. Can you suggest an easy recipe I can make with these ingredients?",
        "Can you explain how the QuickSort algorithm works and provide a Python implementation?",
        "What are some unique features of Rust that make it stand out compared to other systems programming languages like C++?",
    )
]
# Build the chat UI around `generate` and start serving it immediately.
chat_panel = gr.Chatbot(
    show_label=True,
    show_share_button=True,
    show_copy_button=True,
    likeable=True,
    layout="panel",
)
demo = gr.ChatInterface(
    fn=generate,
    chatbot=chat_panel,
    additional_inputs=additional_inputs,
    title="Chat Example",
    examples=examples,
    concurrency_limit=20,
)
demo.launch(show_api=True, share=True)

adamneveml/dataset.csv ADDED Viewed

	@@ -0,0 +1,4 @@

+id,text,sentiment,length,language,source,gender,age_group,time_period,category,entities,question,answer
+1,Horrible pizza at this place.,-1,32,ENG,Twitter,Female,YoungAdult,2023-Q2,review,"['pizza', 'place']",Is this a bad restaurant review?,FALSE
+2,"The first manned mission landed on the moon on July 20th, 1969.",0,24,ENG,Article,Male,Adult,2021-Q1,science,"['apollo', 'moon', 'landing', 'July 20th, 1969']",Did humans land on Mars?,FALSE
+3,"Can machines think? Or more specifically, can robots drive cars?",1,25,ENG,Blog,Other,GenX,2015-Q4,qa,[],Can robots drive cars?,TRUE

adamneveml/datasetcsv.py ADDED Viewed

	@@ -0,0 +1,69 @@

+# create_toy_dataset.py
+from random import randrange
+import pandas as pd
+from datetime import date, timedelta
def get_random_date():
    """Return a random date between 2015-01-01 and today.

    ``randrange(50, 100)`` is treated as a percentage of the elapsed span, so
    the result lies in the later half of the range. The previous code used it
    as a plain multiplier, which produced dates 50-99 spans into the future.
    NOTE(review): the percentage reading is an assumption - confirm intent.
    """
    start = date(2015, 1, 1)
    span_days = (date.today() - start).days
    percent = randrange(50, 100)
    return start + timedelta(days=span_days * percent // 100)
def _as_quarter(day):
    """Format a date as the 'YYYY-Qn' label used by the other rows."""
    return f"{day.year}-Q{(day.month - 1) // 3 + 1}"


# Define the dataset as a list of dictionaries (one dictionary per CSV row).
toy_dataset = [
    {
        'id': 1,
        'text': 'Horrible pizza at this place.',
        'sentiment': -1,
        'length': 32,
        'language': 'ENG',
        'source': 'Twitter',
        'gender': 'Female',
        'age_group': 'YoungAdult',
        # Stored as a quarter label so the column has one format throughout;
        # previously this row held a raw date object next to 'YYYY-Qn' strings.
        'time_period': _as_quarter(get_random_date()),
        'category': 'review',
        'entities': ['pizza', 'place'],
        'question': "Is this a bad restaurant review?",
        'answer': False,
    },
    {
        'id': 2,
        'text': "The first manned mission landed on the moon on July 20th, 1969.",
        'sentiment': 0,
        'length': 24,
        'language': 'ENG',
        'source': 'Article',
        'gender': 'Male',
        'age_group': 'Adult',
        'time_period': '2021-Q1',
        'category': 'science',
        'entities': ['apollo', 'moon', 'landing', 'July 20th, 1969'],
        'question': "Did humans land on Mars?",
        'answer': False,
    },
    {
        'id': 3,
        'text': "Can machines think? Or more specifically, can robots drive cars?",
        'sentiment': 1,
        'length': 25,
        'language': 'ENG',
        'source': 'Blog',
        'gender': 'Other',
        'age_group': 'GenX',
        'time_period': '2015-Q4',
        'category': 'qa',
        'entities': [],
        'question': "Can robots drive cars?",
        'answer': True,
    },
]
# Convert the dataset list into a DataFrame
df = pd.DataFrame(toy_dataset)
# Write DataFrame contents into a CSV file (relative to the working directory)
output_file = 'dataset.csv'
df.to_csv(output_file, index=False)
print(f'Successfully created "{output_file}"!')

adamneveml/features.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b80cf203e6c8a28aac140f8335c021cc12a567cc9becd21ce8ce20d9f680e449
+size 849

adamneveml/purify.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import csv
+import math
+import pickle
+from collections import Counter
+from itertools import chain
+import nltk
+import numpy as np
+import unicodedata
# Fetch the NLTK resources requested by this script.
nltk.download('stopwords')
nltk.download('punkt')
# Read the 'text' column of every row. newline='' is what the csv module
# expects, so quoted fields containing line breaks are parsed correctly.
with open('dataset.csv', 'r', encoding='utf-8', newline='') as f:
    input_reader = csv.DictReader(f)
    sentences = [row['text'] for row in input_reader]
def remove_diacritics(text):
    """Strip diacritics from all characters of ``text``.

    The text is NFKD-normalised and then encoded to ASCII with errors
    ignored, so any character without an ASCII form after decomposition is
    dropped entirely.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
def filter_non_alphabeticals(words):
    """Return the entries of ``words`` that are non-empty and purely alphabetic."""
    kept = []
    for token in words:
        if not token:
            # Skips empty strings and None before calling a str method.
            continue
        if token.isalpha():
            kept.append(token)
    return kept
def flatten_bigram_components(text):
    """Tokenize ``text`` and return a flat list of lowercased words.

    Tokens are lowercased and stripped, joined with single spaces and split
    again on whitespace.
    """
    tokens = nltk.word_tokenize(text)
    normalised = [token.lower().strip() for token in tokens]
    joined = ' '.join(normalised)
    return joined.split()
def get_top_biagrams(counter, num=5):
    """Return the ``num`` most frequent bigrams as ``("w1 w2", count)`` tuples.

    ``counter`` maps word pairs to their counts. Results are ordered by
    descending count; ties keep the iteration order of ``counter``.
    """
    labelled = []
    for pair, count in counter.items():
        labelled.append((' '.join((pair[0], pair[1])), count))
    # list.sort is stable, including with reverse=True.
    labelled.sort(key=lambda item: item[1], reverse=True)
    return labelled[:num]
def tf_idf(term_freqs, inverse_docs):
    """Calculate TF-IDF weightings for term importance.

    Args:
        term_freqs: Mapping of term -> frequency.
        inverse_docs: Sized collection of per-document mappings of
            term -> value. The values of a term are summed over all documents
            that contain it to form the IDF denominator.

    Returns:
        Mapping of term -> ``freq * log10(len(inverse_docs) / denominator)``.
        A term whose denominator is zero (absent from every document, or an
        empty document list) gets weight 0.0 instead of raising
        ZeroDivisionError.
    """
    n_docs = float(len(inverse_docs))
    weights = {}
    for term, freq in term_freqs.items():
        denominator = sum(doc[term] for doc in inverse_docs if term in doc)
        if not denominator:
            # The term carries no document information, so the ratio is undefined.
            weights[term] = 0.0
            continue
        weights[term] = float(freq) * math.log10(n_docs / denominator)
    return weights
def snowball_stemmer(words):
    """Stem every entry of ``words`` with NLTK's English SnowballStemmer.

    Returns a new list of stems in the same order as the input.
    """
    stem = nltk.SnowballStemmer('english').stem
    return list(map(stem, words))
def save_feature(filename, X):
    """Pickle ``X`` into ``filename`` using the highest available protocol.

    An existing file at ``filename`` is overwritten.
    """
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL).dump(X)
def _tf_idf_rows(documents, vocabulary):
    """Return one TF-IDF row vector per document, with columns in ``vocabulary`` order."""
    column = {term: idx for idx, term in enumerate(vocabulary)}
    n_docs = len(documents)
    # Document frequency: how many documents contain each term at least once.
    doc_freqs = Counter(chain.from_iterable(set(doc) for doc in documents))
    # Same smoothing as the earlier code: log(N / (1 + df)). This is zero or
    # negative for terms that occur in (almost) every document.
    idfs = np.zeros(len(vocabulary))
    for term, idx in column.items():
        idfs[idx] = np.log(n_docs / (1 + doc_freqs[term]))
    rows = []
    for doc in documents:
        counts = np.zeros(len(vocabulary))
        for term, freq in Counter(doc).items():
            counts[column[term]] = freq
        total = counts.sum()
        # A document with no remaining tokens yields an all-zero row.
        term_freq = counts / total if total else counts
        rows.append(term_freq * idfs)
    return rows


def main():
    """Clean the loaded sentences, report top bigrams and save TF-IDF features.

    Steps: strip diacritics and lowercase, tokenize, drop English stopwords
    and non-alphabetic tokens, print the five most frequent bigrams, then
    build a (documents x vocabulary) TF-IDF matrix and pickle it to
    ``features.pickle``.

    Each row now describes one sentence. The previous version reused the last
    sentence for every row and derived document frequencies from the whole
    vocabulary, so its output did not depend on the individual documents.
    """
    # Load the stopword list once; a set gives constant-time membership tests.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    filtered_sentences = []
    for raw in sentences:
        tokens = nltk.word_tokenize(remove_diacritics(raw).lower())
        kept = [token for token in tokens if token not in stop_words]
        filtered_sentences.append(filter_non_alphabeticals(kept))

    bigrams = []
    for sentence in filtered_sentences:
        expanded_flat_sentence = flatten_bigram_components(' '.join(sentence))
        print("Expanded Flat Sentence:", expanded_flat_sentence)
        bigrams.extend(nltk.bigrams(expanded_flat_sentence))
    bigram_counter = Counter(bigrams)
    print("\nTop five occurring bigrams:\n")
    print(*get_top_biagrams(bigram_counter, 5), sep='\n')

    # Sorted vocabulary gives the feature columns a reproducible order.
    vocabulary = sorted(set(chain.from_iterable(filtered_sentences)))
    tf_idfs = _tf_idf_rows(filtered_sentences, vocabulary)
    # Display a couple of samples for verification
    print("Sample TF-IDF Features (before stacking):", tf_idfs[:3])
    # Stack the per-document rows; np.vstack rejects an empty list, so an
    # empty corpus produces an explicit (0, vocabulary) array instead.
    features = np.vstack(tf_idfs) if tf_idfs else np.zeros((0, len(vocabulary)))
    print("Features Array Shape:", features.shape)
    # Save the features as a pickle file
    save_feature('features.pickle', features)
+main()

history.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f0487c2986ecb7b7f894f81252cbc9196724903a003ee5c8d444cd9a23aee63f
+size 14007