import streamlit as st
import requests

enable_xorbits = True
if enable_xorbits:
    import xorbits
    xorbits.init()
    import xorbits.pandas as pd
else:
    import pandas as pd

st.set_page_config(page_title="Analyzing Text Corpus on Hugging Face", page_icon=":bar_chart:", layout="wide")

st.sidebar.title('A Tool for Analyzing Text Corpus on Hugging Face')
st.sidebar.markdown(
    '''
This tool retrieves parquet files from Hugging Face, identifies and quantifies
junk data, duplication, contamination, and biased content in a dataset using pandas DataFrames,
and accelerates the time-consuming processing with Xorbits.
    '''
)
st.sidebar.header("Please Paste The HF Dataset Name Here:")


#@st.cache_data
def load_dataset(j, name, fraction):
    import os
    if not os.path.exists('%s-train.gzip' % name):
        with st.spinner('Downloading file from remote server'):
            import pandas
            train_urls = [f['url'] for f in j['parquet_files'] if f['config'] == name and f['split'] == 'train']
            train_dataset = pandas.concat([pandas.read_parquet(url, engine='pyarrow') for url in train_urls], ignore_index=True)
            train_dataset.to_parquet('%s-train.gzip' % name)
    if not os.path.exists('%s-test.gzip' % name):
        with st.spinner('Downloading file from remote server'):
            import pandas
            test_urls = [f['url'] for f in j['parquet_files'] if f['config'] == name and f['split'] == 'validation']
            test_dataset = pandas.concat([pandas.read_parquet(url, engine='pyarrow') for url in test_urls], ignore_index=True)
            test_dataset.to_parquet('%s-test.gzip' % name)
    train_dataset = pd.read_parquet('%s-train.gzip' % name, engine='pyarrow')
    test_dataset = pd.read_parquet('%s-test.gzip' % name, engine='pyarrow')
    if enable_xorbits:
        # rebalance() returns a new DataFrame; reassign so the repartitioning takes effect
        train_dataset = train_dataset.rebalance()
        test_dataset = test_dataset.rebalance()
    dataset = {
        "train": train_dataset.sample(frac=fraction),
        "test": test_dataset.sample(frac=fraction),
    }
    return dataset
def get_hugging_face_dataset(name):
    r = requests.get("https://datasets-server.huggingface.co/parquet?dataset=" + name)
    return r.json()


dataset_name = st.sidebar.text_input('Dataset Name', 'blog_authorship_corpus')

with st.spinner('Loading meta'):
    hf_datasets = get_hugging_face_dataset(dataset_name)
    subsets = set([x['config'] for x in hf_datasets['parquet_files']])

subset_option = st.sidebar.selectbox("Choose a subset", subsets)
# the default value must lie within [min_value, max_value]
sample_rate_option = st.sidebar.slider('Select sample rate', value=0.1, min_value=0.1, max_value=1.0, step=0.1)

tab0, tab1, tab2, tab3, tab4, tab5 = st.tabs(
    ["Introduction", "Junk Data🤖", "Biased Content🛡️", "Short Documents🌐", "Contamination🧹", "Duplication🔍"])
with tab0:
    st.markdown(
        '''
### Why this matters?
LLMs are trained on immense datasets to gain a broader understanding of language and improve
their performance.
However, the quality of those datasets affects the performance and biases of the models.
Large datasets often have quality issues, so practitioners need to clean and preprocess
the data to remove biases, noise, and toxicity.
This tool illustrates how to analyze and quantify the quality
of any text corpus on [Hugging Face](https://huggingface.co/blog/hub-duckdb) using pandas.

### Data Preparation

#### 1. Retrieving parquet files from the Hugging Face Dataset Server
First, get the list of Parquet file URLs with a simple HTTP call.
```python
r = requests.get("https://datasets-server.huggingface.co/parquet?dataset=blog_authorship_corpus")
j = r.json()
urls = [f['url'] for f in j['parquet_files'] if f['split'] == 'train']
urls
['https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/blog_authorship_corpus-train-00000-of-00002.parquet',
 'https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/blog_authorship_corpus-train-00001-of-00002.parquet']
```
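Datasets with multiple subsets also expose a `config` field on each file entry, and this tool
filters on it so that only the chosen subset is loaded. A minimal sketch reusing the response `j`
from above (the config value here is just an illustrative assumption):
```python
config = 'blog_authorship_corpus'  # example subset name; pick any value of f['config']
urls = [f['url'] for f in j['parquet_files']
        if f['config'] == config and f['split'] == 'train']
```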
#### 2. Reading the URLs into a pandas DataFrame
Use pandas to read the Parquet files from the list of URLs and concatenate
them into a single DataFrame:
```python
import pandas as pd

parts = [pd.read_parquet(url) for url in urls]
df = pd.concat(parts, ignore_index=True)
```
#### 3. Addressing out-of-memory and performance issues
pandas uses in-memory data structures to store and operate on data,
which means that if the dataset you read from Hugging Face is too large to fit in memory,
pandas will fail with an error. We therefore use [Xorbits](https://xorbits.io) to handle
larger datasets and make better use of the local CPU cores.
Using Xorbits is as simple as swapping the imports:
```python
import xorbits.pandas as pd
import xorbits.numpy as np
```
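With those imports in place, the same concat-from-URLs pipeline from step 2 runs on Xorbits
instead of pandas. A minimal sketch, assuming the `urls` list from step 1 (`xorbits.init()`
starts a local runtime by default):
```python
import xorbits
import xorbits.pandas as pd

xorbits.init()  # start a local Xorbits runtime
parts = [pd.read_parquet(url, engine='pyarrow') for url in urls]
df = pd.concat(parts, ignore_index=True)  # same pandas-style API, executed by Xorbits
```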
---
        '''
    )

    with st.expander("View raw data"):
        with st.spinner("Loading..."):
            datasets = load_dataset(hf_datasets, subset_option, sample_rate_option)

            train, test = st.tabs([
                "Train (%d rows)" % len(datasets['train']),
                "Test (%d rows)" % len(datasets['test'])
            ])
            train.dataframe(datasets['train'][:20])
            test.dataframe(datasets['test'][:20])
with tab1:
    st.header("Junk Data")

    st.markdown(r'''
Large-scale datasets often contain an uneven distribution of text representation, including
a significant amount of nonsensical and boilerplate text, such as HTML tags.
The presence of such "noise" or irrelevant content in the dataset is detrimental to the
training of predictive models, specifically those that operate by predicting the next token based on all previous ones.
Therefore, it is crucial to clean the dataset and remove these undesired elements prior to the training phase.

The following Python code calculates a measure of "impurity" in text documents and then computes
the proportion of documents that exceed a certain impurity threshold. It defines a compiled regular expression that matches
any of the following suspicious characters: `&, #, <, >, {, }, [, ]`.
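For instance, a quick illustration of the impurity score on a single HTML-like string (the 1%
cutoff used below is a tunable choice, not a fixed rule):
```python
import re

RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')

text = '<div class="post">Hello &amp; welcome</div>'
score = len(RE_SUSPICIOUS.findall(text)) / len(text)  # 5 hits / 43 characters, roughly 0.12
```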
    ''')

    metrics, code = st.tabs(['Metrics', 'Code'])
    with metrics:
        with st.spinner('Calculating impurity ratio...'):
            df = datasets['train']

            import re
            RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')

            def impurity(text, min_len=10):
                """returns the share of suspicious characters in a text"""
                if text is None or len(text) < min_len:
                    return 0
                else:
                    return len(RE_SUSPICIOUS.findall(text)) / len(text)

            df['impurity'] = df['text'].apply(impurity, min_len=10)

            total_num_docs = len(df)
            impurity_num_docs = len(df[df['impurity'] > 0.01])
            impurity_ratio = impurity_num_docs / total_num_docs

            col1, col2, col3 = st.columns(3)
            col1.metric(label="Junk Doc Count", value="%d" % impurity_num_docs)
            col2.metric(label="Total Doc Count", value="%d" % total_num_docs)
            col3.metric(label="Junk Doc Ratio", value="%.2f%%" % (impurity_ratio * 100))

            st.dataframe(df[['text', 'impurity']].sort_values(by='impurity', ascending=False)[:20])
    with code:
        st.code(
            r'''
import re

RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')

def impurity(text, min_len=10):
    """returns the share of suspicious characters in a text"""
    if text is None or len(text) < min_len:
        return 0
    else:
        return len(RE_SUSPICIOUS.findall(text)) / len(text)

df['impurity'] = df['text'].apply(impurity, min_len=10)
total_num_docs = len(df)
impurity_num_docs = len(df[df['impurity'] > 0.01])
impurity_ratio = impurity_num_docs / total_num_docs
            '''
        )
with tab2:
    st.header('Toxic Content')

    st.markdown('''
It is crucial in the training of language models to be vigilant and potentially apply tools
to exclude toxic content from the pre-training datasets. This practice helps
prevent the models from exhibiting bias or generating detrimental content in downstream applications.

One approach to address this issue is by scanning the text for **offensive words**.
For instance, the creators of the C4 dataset implemented such a filtering mechanism.
The code below references the open-source
[word list](https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en)
they used, and applies it to quantify the "biased content ratio" in the dataset.
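As a toy illustration of the matching rule used below (exact, lower-cased, whitespace-delimited
word matches against the list; the placeholder entries stand in for the real word list):
```python
banned_words = ['badword1', 'badword2']  # placeholders for the real list entries
text = 'This sentence happens to contain badword1 once.'
hits = [w for w in banned_words if w in text.lower().split()]  # ['badword1']
```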
    ''')

    metrics, code = st.tabs(['Metrics', 'Code'])
    with metrics:
        with st.spinner('Calculating toxic ratio...'):
            df = datasets['train']

            with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
                lines = f.readlines()
            banned_words = [line.rstrip('\n') for line in lines]

            df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
            df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)

            total_num_docs = len(df)
            biased_num_docs = df['matches'].sum()
            biased_content_ratio = biased_num_docs / total_num_docs

            col1, col2, col3 = st.columns(3)
            col1.metric(label="Total Doc Count", value="%d" % total_num_docs)
            col2.metric(label="Biased Doc Count", value="%d" % biased_num_docs)
            col3.metric(label="Biased Ratio", value="%.2f%%" % (biased_content_ratio * 100))

            st.dataframe(df[df['matches']][['text', 'banned_words_in_text']][:20])
    with code:
        st.code(
            r'''
with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
    lines = f.readlines()
banned_words = [line.rstrip('\n') for line in lines]

df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
total_num_docs = len(df)
biased_num_docs = df['matches'].sum()
biased_content_ratio = biased_num_docs / total_num_docs
            '''
        )
with tab3:
    st.header("Too-Short Documents")

    st.markdown('''
The aim of language modeling is to master the generation of text based on preceding tokens.
In this scenario, eliminating extremely brief documents (text consisting of fewer than approximately
100 tokens) from the corpus can help reduce noise by ensuring there is enough contiguous text to
model dependencies within it.

Here we use the Hugging Face Transformers library to tokenize the text and then calculate the proportion
of documents that are "too short". This example converts text into tokens that the BERT
model can understand; choose a tokenizer that matches your own model.
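For example, counting BERT tokens for a single string (a small sketch; the 100-token cutoff
below is a heuristic, not a fixed standard):
```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
n_tokens = len(tokenizer.tokenize('A very short document.'))  # 5 tokens, so this one counts as "too short"
```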
    ''')

    metrics, code = st.tabs(['Metrics', 'Code'])
    with metrics:
        with st.spinner('Calculating too-short ratio...'):
            from transformers import BertTokenizer
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

            df = datasets['train']
            # Create a new column with the number of tokens for each text
            df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))

            total_num_docs = len(df)
            too_short_docs = len(df[df['text_length'] < 100])
            too_short_doc_ratio = too_short_docs / total_num_docs

            col1, col2, col3 = st.columns(3)
            col1.metric(label="Too-Short Doc Count", value="%d" % too_short_docs)
            col2.metric(label="Total Doc Count", value="%d" % total_num_docs)
            col3.metric(label="Too-Short Doc Ratio", value="%.2f%%" % (too_short_doc_ratio * 100))

            # col1, _ = st.columns([2, 1])
            # import seaborn as sns
            # import matplotlib.pyplot as plt
            # fig, ax = plt.subplots(figsize=(10, 5))
            # ax.set_title('Distribution of text length (in tokens)')
            # sns.histplot(data=df, x='text_length', ax=ax)
            # plt.axvline(100, color='r', linestyle='--')
            # col1.pyplot(fig)
    with code:
        st.code(
            '''
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
df = datasets['train']

# Create a new column with the number of tokens for each text
df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
total_num_docs = len(df)
too_short_docs = len(df[df['text_length'] < 100])
too_short_doc_ratio = too_short_docs / total_num_docs
            '''
        )
with tab4:
    st.header('Contamination')

    st.markdown('''
Typically, ensuring the segregation of training and testing data is rather straightforward in machine learning.
However, things become complicated in the context of large language models,
where both the training and benchmarking datasets are collected from the internet.

For instance, the performance evaluation of a large language model using benchmark data
(like question-answer pairs) can be significantly inflated if the benchmark data also appears
in the model's training set. The procedure of eliminating instances from the training datasets that intersect with
the existing benchmarking datasets is called "decontamination".

The code below quantifies the contamination problem in the dataset,
i.e., the proportion of documents in the test set that also appear in the training set, using N-grams.
The approach follows the GPT-3 paper: OpenAI defined a test document as contaminated
if any N-gram overlap existed with any training document
(they used a range of N values between 8 and 13 depending on the dataset).
When constructing the WebText dataset, OpenAI researchers decontaminated the data by
eliminating all Wikipedia content from the training set. This was necessary as Wikipedia
data was heavily used in their benchmark datasets.
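The building block is a MinHash signature over character 13-grams of each document. A small
sketch with the same `nltk`/`datasketch` calls used in the Code tab (the two sample sentences
are made up for illustration):
```python
from nltk import ngrams
from datasketch import MinHash

def signature(text, n=13, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for gram in ngrams(text, n):  # character n-grams, since text is a string
        m.update(''.join(gram).encode('utf-8'))
    return m

a = signature('The quick brown fox jumps over the lazy dog.')
b = signature('The quick brown fox jumped over the lazy dog.')
similarity = a.jaccard(b)  # estimated 13-gram overlap between the two texts
```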
    ''')

    metrics, code = st.tabs(['Metrics', 'Code'])
    with metrics:
        with st.spinner('Calculating contamination ratio...'):
            train_dataset = datasets['train']
            test_dataset = datasets['test']

            from nltk import ngrams
            from datasketch import MinHash, MinHashLSH

            def process_data(df):
                minhashes = {}
                for idx, text in enumerate(df['text']):
                    minhash = MinHash(num_perm=128)
                    for d in ngrams(text, 13):
                        s = "".join(d).encode('utf-8')
                        minhash.update(s)
                    minhashes[idx] = minhash
                return minhashes

            train_minhashes = process_data(train_dataset)
            test_minhashes = process_data(test_dataset)

            lsh = MinHashLSH(threshold=0.8, num_perm=128)
            for idx, minhash in train_minhashes.items():
                lsh.insert(idx, minhash)

            duplicates_count = 0
            for idx, minhash in test_minhashes.items():
                result = lsh.query(minhash)
                if len(result) > 0:
                    duplicates_count += 1

            train_dataset_count = len(train_dataset)
            test_dataset_count = len(test_dataset)
            contaminate_ratio = duplicates_count / test_dataset_count

            col1, col2, col3, col4 = st.columns(4)
            col1.metric(label="Train Set Size", value="%d" % train_dataset_count)
            col2.metric(label="Test Set Size", value="%d" % test_dataset_count)
            col3.metric(label="Overlapped Docs", value="%d" % duplicates_count)
            col4.metric(label="Contaminated Ratio", value="%.2f%%" % (contaminate_ratio * 100))
    with code:
        st.code(
            '''
from nltk import ngrams
from datasketch import MinHash, MinHashLSH

def process_data(df):
    minhashes = {}
    for idx, r in df.iterrows():
        minhash = MinHash(num_perm=128)
        for d in ngrams(r['text'], 13):
            s = "".join(d).encode('utf-8')
            minhash.update(s)
        minhashes[idx] = minhash
    return minhashes

train_minhashes = process_data(train_dataset)
test_minhashes = process_data(test_dataset)

lsh = MinHashLSH(threshold=0.8, num_perm=128)
for idx, minhash in train_minhashes.items():
    lsh.insert(idx, minhash)

duplicates_count = 0
for idx, minhash in test_minhashes.items():
    result = lsh.query(minhash)
    if len(result) > 0:
        duplicates_count += 1

train_dataset_count = len(train_dataset)
test_dataset_count = len(test_dataset)
contaminate_ratio = duplicates_count / test_dataset_count
            '''
        )
with tab5:
    st.header("Duplication")

    st.markdown(
        '''
When datasets are created by scraping raw text from the internet, this often results
in the same sequences being repeated many times. [This paper](https://arxiv.org/abs/2107.06499) mentions a single 50-word sequence that is
repeated in the C4 dataset 60,000 times.

Deduplication helps prevent models from outputting verbatim training data when
there are many duplicates, and makes models less vulnerable to privacy attacks.
Deduplication can also improve model training efficiency and prevent benchmark contamination.

### Tools & Tutorials

The [GPT-3](https://arxiv.org/abs/2005.14165) paper mentions that the authors fuzzily deduplicated documents
within each dataset using Spark's MinHashLSH implementation with 10 hashes.

[deduplicate-text-datasets](https://github.com/google-research/deduplicate-text-datasets)
is an ExactSubstr deduplication implementation (written in Rust) along with scripts to
perform ExactSubstr deduplication and inspect the results (written in Python).

[datasketch](https://github.com/ekzhu/datasketch) provides probabilistic data structures that
can process and search very large amounts of data quickly, with little loss of accuracy.

[This article](https://huggingface.co/blog/dedup) provides a MinHash walkthrough that demonstrates
how to implement parallel deduplication.

The following code uses the [datasketch](https://github.com/ekzhu/datasketch) library and LSH (Locality Sensitive Hashing)
to deduplicate the dataset. For each text in the DataFrame, it creates a query MinHash object
and performs a query on the LSH index to find similar documents.

It is worth mentioning that the deduplication process usually requires a lot of computational resources
(CPU and RAM) due to the size of web crawl datasets; it is therefore recommended to run such
computations in a distributed setting.
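As a toy example of the LSH query used below (word-level MinHash with the same 0.85 threshold;
the two near-duplicate strings are made up):
```python
from datasketch import MinHash, MinHashLSH

def word_minhash(text, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for word in text.split():
        m.update(word.encode('utf-8'))
    return m

docs = [
    'large datasets scraped from the internet often contain many repeated sentences',
    'large datasets scraped from the internet often contain many repeated sentences too',
]
lsh = MinHashLSH(threshold=0.85, num_perm=128)
for i, doc in enumerate(docs):
    lsh.insert(str(i), word_minhash(doc))

matches = lsh.query(word_minhash(docs[0]))  # expected to return both '0' and '1' for these near-duplicates
```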
        '''
    )

    metrics, code = st.tabs(['Metrics', 'Code'])
    with metrics:
        with st.spinner('Calculating duplication ratio...'):
            df = datasets['train']

            from datasketch import MinHashLSH, MinHash

            lsh = MinHashLSH(threshold=0.85, num_perm=128)
            for i, text in enumerate(df['text']):
                minhash = MinHash(num_perm=128)
                for word in text.split():
                    minhash.update(word.encode('utf-8'))
                lsh.insert(str(i), minhash)

            unique_documents = set()
            for i, text in enumerate(df['text']):
                query_minhash = MinHash(num_perm=128)
                for word in text.split():
                    query_minhash.update(word.encode('utf-8'))
                results = lsh.query(query_minhash)
                unique_documents.add(results[0])

            total_unique_documents = len(unique_documents)
            total_documents = len(df)
            duplication_ratio = (total_documents - total_unique_documents) / total_documents

            col1, col2, col3 = st.columns(3)
            col1.metric(label="Unique Doc Count", value="%d" % total_unique_documents)
            col2.metric(label="Total Documents", value="%d" % total_documents)
            col3.metric(label="Duplication Ratio", value="%.2f%%" % (duplication_ratio * 100))
    with code:
        st.code(
            '''
from datasketch import MinHashLSH, MinHash

lsh = MinHashLSH(threshold=0.85, num_perm=128)
for i, text in enumerate(df['text']):
    minhash = MinHash(num_perm=128)
    for word in text.split():
        minhash.update(word.encode('utf-8'))
    lsh.insert(str(i), minhash)

unique_documents = set()
for i, text in enumerate(df['text']):
    query_minhash = MinHash(num_perm=128)
    for word in text.split():
        query_minhash.update(word.encode('utf-8'))
    results = lsh.query(query_minhash)
    unique_documents.add(results[0])

total_unique_documents = len(unique_documents)
total_documents = len(df)
duplication_ratio = (total_documents - total_unique_documents) / total_documents
            '''
        )