import gradio as gr import datetime import json import requests from constants import * def process(query_type, corpus_desc, engine_desc, query, maxnum, request: gr.Request): timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S') corpus = CORPUS_BY_DESC[corpus_desc] engine = ENGINE_BY_DESC[engine_desc] data = { 'source': 'hf' if not DEBUG else 'hf-dev', 'timestamp': timestamp, 'query_type': query_type, 'corpus': corpus, 'engine': engine, 'query': query, } if maxnum is not None: data['maxnum'] = maxnum print(json.dumps(data)) if API_URL is None: raise ValueError(f'API_URL envvar is not set!') try: response = requests.post(API_URL, json=data, timeout=10) except requests.exceptions.Timeout: raise ValueError('Web request timed out. Please try again later.') except requests.exceptions.RequestException as e: raise ValueError(f'Web request error: {e}') if response.status_code == 200: result = response.json() else: raise ValueError(f'HTTP error {response.status_code}: {response.json()}') if DEBUG: print(result) return result def format_tokenization_info(result): if not ('token_ids' in result and 'tokens' in result): return '' token_ids = result['token_ids'] tokens = result['tokens'] if type(token_ids) == list and all([type(token_id) == int for token_id in token_ids]): output = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids) else: ttt = [] for token_idss, tokenss in zip(token_ids, tokens): tt = [] for token_ids, tokens in zip(token_idss, tokenss): t = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids) tt.append(t) tt = '\n'.join(tt) ttt.append(tt) output = '\n\n'.join(ttt) return output def format_doc(doc): formatted = [] if doc['doc_len'] == doc['disp_len']: header = f'[Document #{doc["doc_ix"]}, length = {doc["doc_len"]} tokens]\n\n' else: header = f'[Document #{doc["doc_ix"]}, length = {doc["doc_len"]} tokens ({doc["disp_len"]} tokens displayed)]\n\n' 
formatted.append((header, None)) formatted += doc['spans'] return formatted def count(corpus_desc, engine_desc, query, request: gr.Request): result = process('count', corpus_desc, engine_desc, query, None, request) latency = '' if 'latency' not in result else f'{result["latency"]:.3f}' tokenization_info = format_tokenization_info(result) if 'error' in result: count = result['error'] else: count = f'{result["count"]:,}' return latency, tokenization_info, count def prob(corpus_desc, engine_desc, query, request: gr.Request): result = process('prob', corpus_desc, engine_desc, query, None, request) latency = '' if 'latency' not in result else f'{result["latency"]:.3f}' tokenization_info = format_tokenization_info(result) if 'error' in result: prob = result['error'] elif result['prompt_cnt'] == 0: prob = '(n-1)-gram is not found in the corpus' else: prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})' return latency, tokenization_info, prob def ntd(corpus_desc, engine_desc, query, request: gr.Request): result = process('ntd', corpus_desc, engine_desc, query, None, request) latency = '' if 'latency' not in result else f'{result["latency"]:.3f}' tokenization_info = format_tokenization_info(result) if 'error' in result: ntd = result['error'] else: result_by_token_id = result['result_by_token_id'] ntd = {} for token_id, r in result_by_token_id.items(): ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob'] if ntd == {}: ntd = '(n-1)-gram is not found in the corpus' return latency, tokenization_info, ntd def infgram_prob(corpus_desc, engine_desc, query, request: gr.Request): result = process('infgram_prob', corpus_desc, engine_desc, query, None, request) latency = '' if 'latency' not in result else f'{result["latency"]:.3f}' tokenization_info = format_tokenization_info(result) if 'error' in result: longest_suffix = '' prob = result['error'] else: longest_suffix = result['longest_suffix'] prob = f'{result["prob"]:.4f} 
({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})' return latency, tokenization_info, longest_suffix, prob def infgram_ntd(corpus_desc, engine_desc, query, request: gr.Request): result = process('infgram_ntd', corpus_desc, engine_desc, query, None, request) latency = '' if 'latency' not in result else f'{result["latency"]:.3f}' tokenization_info = format_tokenization_info(result) if 'error' in result: longest_suffix = '' ntd = result['error'] else: longest_suffix = result['longest_suffix'] result_by_token_id = result['result_by_token_id'] ntd = {} for token_id, r in result_by_token_id.items(): ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob'] return latency, tokenization_info, longest_suffix, ntd def search_docs(corpus_desc, engine_desc, query, maxnum, request: gr.Request): result = process('search_docs', corpus_desc, engine_desc, query, maxnum, request) latency = '' if 'latency' not in result else f'{result["latency"]:.3f}' tokenization_info = format_tokenization_info(result) if 'error' in result: message = result['error'] docs = [[] for _ in range(10)] else: message = result['message'] docs = result['documents'] docs = [format_doc(doc) for doc in docs] docs = docs[:maxnum] while len(docs) < 10: docs.append([]) return latency, tokenization_info, message, docs[0], docs[1], docs[2], docs[3], docs[4], docs[5], docs[6], docs[7], docs[8], docs[9] def analyze_document(corpus_desc, engine_desc, query, request: gr.Request): result = process('analyze_document', corpus_desc, engine_desc, query, None, request) return result.get('latency', ''), result.get('html', '') with gr.Blocks() as demo: with gr.Column(): gr.HTML( '''

Infini-gram: An Engine for n-gram / ∞-gram Language Modeling with Trillion-Token Corpora

This is an engine that processes n-gram / ∞-gram queries on massive text corpora. Please first select the corpus and the type of query, then enter your query and submit.

The engine is developed by Jiacheng (Gary) Liu and documented in our paper: Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens. Feel free to check out our Project Homepage.

API Endpoint: If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the API documentation.

Note: The query is case-sensitive. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).

''' ) with gr.Row(): with gr.Column(scale=1): corpus_desc = gr.Radio(choices=CORPUS_DESCS, label='Corpus', value=CORPUS_DESCS[0]) engine_desc = gr.Radio(choices=ENGINE_DESCS, label='Engine', value=ENGINE_DESCS[0]) with gr.Column(scale=5): with gr.Tab('1. Count an n-gram'): with gr.Column(): gr.HTML('

1. Count an n-gram

') gr.HTML('

This counts the number of times an n-gram appears in the corpus. If you submit an empty input, it will return the total number of tokens in the corpus.

') gr.HTML('

Example query: natural language processing (the output is Cnt(natural language processing))

') with gr.Row(): with gr.Column(scale=1): count_query = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True) with gr.Row(): count_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True) count_submit = gr.Button(value='Submit', variant='primary', visible=True) count_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1) count_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False) with gr.Column(scale=1): count_count = gr.Label(label='Count', num_top_classes=0) count_clear.add([count_query, count_latency, count_tokenized, count_count]) count_submit.click(count, inputs=[corpus_desc, engine_desc, count_query], outputs=[count_latency, count_tokenized, count_count], api_name=False) with gr.Tab('2. Prob of the last token'): with gr.Column(): gr.HTML('

2. Compute the probability of the last token in an n-gram

') gr.HTML('

This computes the n-gram probability of the last token conditioned on the previous tokens (i.e., the (n-1)-gram).

') gr.HTML('

Example query: natural language processing (the output is P(processing | natural language), obtained by counting the occurrences of the 3-gram "natural language processing" and the 2-gram "natural language", and dividing the former by the latter)

') gr.HTML('

Note: The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.

') with gr.Row(): with gr.Column(scale=1): prob_query = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True) with gr.Row(): prob_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True) prob_submit = gr.Button(value='Submit', variant='primary', visible=True) prob_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1) prob_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False) with gr.Column(scale=1): prob_probability = gr.Label(label='Probability', num_top_classes=0) prob_clear.add([prob_query, prob_latency, prob_tokenized, prob_probability]) prob_submit.click(prob, inputs=[corpus_desc, engine_desc, prob_query], outputs=[prob_latency, prob_tokenized, prob_probability], api_name=False) with gr.Tab('3. Next-token distribution'): with gr.Column(): gr.HTML('

3. Compute the next-token distribution of an (n-1)-gram

') gr.HTML('

This is an extension of Query 2: it interprets your input as the (n-1)-gram and gives you the full next-token distribution.

') gr.HTML('

Example query: natural language (the output is P(* | natural language), for the top-10 tokens *)

') gr.HTML(f'

Note: The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear. If the (n-1)-gram appears more than {MAX_CNT_FOR_NTD} times in the corpus, the result will be approximate.

') with gr.Row(): with gr.Column(scale=1): ntd_query = gr.Textbox(placeholder='Enter a string (an (n-1)-gram) here', label='Query', interactive=True) with gr.Row(): ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True) ntd_submit = gr.Button(value='Submit', variant='primary', visible=True) ntd_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1) ntd_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False) with gr.Column(scale=1): ntd_distribution = gr.Label(label='Distribution', num_top_classes=10) ntd_clear.add([ntd_query, ntd_latency, ntd_tokenized, ntd_distribution]) ntd_submit.click(ntd, inputs=[corpus_desc, engine_desc, ntd_query], outputs=[ntd_latency, ntd_tokenized, ntd_distribution], api_name=False) with gr.Tab('4. ∞-gram prob'): with gr.Column(): gr.HTML('

4. Compute the ∞-gram probability of the last token

') gr.HTML('

This computes the ∞-gram probability of the last token conditioned on the previous tokens. Compared to Query 2 (which uses your entire input for n-gram modeling), here we take the longest suffix that we can find in the corpus.

') gr.HTML('

Example query: I love natural language processing (the output is P(processing | natural language), because "natural language" appears in the corpus but "love natural language" doesn\'t; in this case the effective n = 3)

') gr.HTML('

Note: It may be possible that the effective n = 1, in which case it reduces to the uni-gram probability of the last token.

') with gr.Row(): with gr.Column(scale=1): infgram_prob_query = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True) with gr.Row(): infgram_prob_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True) infgram_prob_submit = gr.Button(value='Submit', variant='primary', visible=True) infgram_prob_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1) infgram_prob_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False) infgram_prob_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False) with gr.Column(scale=1): infgram_prob_probability = gr.Label(label='Probability', num_top_classes=0) infgram_prob_clear.add([infgram_prob_query, infgram_prob_latency, infgram_prob_tokenized, infgram_prob_longest_suffix, infgram_prob_probability]) infgram_prob_submit.click(infgram_prob, inputs=[corpus_desc, engine_desc, infgram_prob_query], outputs=[infgram_prob_latency, infgram_prob_tokenized, infgram_prob_longest_suffix, infgram_prob_probability], api_name=False) with gr.Tab('5. ∞-gram next-token distribution'): with gr.Column(): gr.HTML('

5. Compute the ∞-gram next-token distribution

') gr.HTML('

This is similar to Query 3, but with ∞-gram instead of n-gram.

') gr.HTML('

Example query: I love natural language (the output is P(* | natural language), for the top-10 tokens *)

') with gr.Row(): with gr.Column(scale=1): infgram_ntd_query = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True) with gr.Row(): infgram_ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True) infgram_ntd_submit = gr.Button(value='Submit', variant='primary', visible=True) infgram_ntd_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1) infgram_ntd_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False) infgram_ntd_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False) with gr.Column(scale=1): infgram_ntd_distribution = gr.Label(label='Distribution', num_top_classes=10) infgram_ntd_clear.add([infgram_ntd_query, infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution]) infgram_ntd_submit.click(infgram_ntd, inputs=[corpus_desc, engine_desc, infgram_ntd_query], outputs=[infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution], api_name=False) with gr.Tab('6. Search documents'): with gr.Column(): gr.HTML(f'''

6. Search for documents containing n-gram(s)

This displays a few random documents in the corpus that satisfy your query. You can simply enter an n-gram, in which case the displayed documents will contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in CNF format, in which case the displayed documents contain n-grams that satisfy this logical constraint.

Example queries:

If you want another batch of random documents, simply hit the Submit button again :)

A few notes:

❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)

''') with gr.Row(): with gr.Column(scale=2): search_docs_query = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True) search_docs_maxnum = gr.Slider(minimum=1, maximum=10, value=1, step=1, label='Number of documents to Display') with gr.Row(): search_docs_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True) search_docs_submit = gr.Button(value='Submit', variant='primary', visible=True) search_docs_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1) search_docs_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False) with gr.Column(scale=3): search_docs_message = gr.Label(label='Message', num_top_classes=0) with gr.Tab(label='1'): search_docs_output_0 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"}) with gr.Tab(label='2'): search_docs_output_1 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"}) with gr.Tab(label='3'): search_docs_output_2 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"}) with gr.Tab(label='4'): search_docs_output_3 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"}) with gr.Tab(label='5'): search_docs_output_4 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"}) with gr.Tab(label='6'): search_docs_output_5 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"}) with gr.Tab(label='7'): search_docs_output_6 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"}) with 
gr.Tab(label='8'): search_docs_output_7 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"}) with gr.Tab(label='9'): search_docs_output_8 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"}) with gr.Tab(label='10'): search_docs_output_9 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"}) search_docs_clear.add([search_docs_query, search_docs_latency, search_docs_tokenized, search_docs_message, search_docs_output_0, search_docs_output_1, search_docs_output_2, search_docs_output_3, search_docs_output_4, search_docs_output_5, search_docs_output_6, search_docs_output_7, search_docs_output_8, search_docs_output_9]) search_docs_submit.click(search_docs, inputs=[corpus_desc, engine_desc, search_docs_query, search_docs_maxnum], outputs=[search_docs_latency, search_docs_tokenized, search_docs_message, search_docs_output_0, search_docs_output_1, search_docs_output_2, search_docs_output_3, search_docs_output_4, search_docs_output_5, search_docs_output_6, search_docs_output_7, search_docs_output_8, search_docs_output_9], api_name=False) with gr.Tab('7. Analyze an (AI-generated) document using ∞-gram', visible=False): with gr.Column(): gr.HTML('

7. Analyze an (AI-generated) document using ∞-gram

') gr.HTML('

This analyzes the document you entered using the ∞-gram. Each token is highlighted where (1) the color represents its ∞-gram probability (red is 0.0, blue is 1.0), and (2) the alpha represents the effective n (higher alpha means higher n).

') gr.HTML('

If you hover over a token, the tokens preceding it are each highlighted where (1) the color represents the n-gram probability of your selected token, with the n-gram starting from that highlighted token (red is 0.0, blue is 1.0), and (2) the alpha represents the count of the (n-1)-gram starting from that highlighted token (and up to but excluding your selected token) (higher alpha means higher count).

') with gr.Row(): with gr.Column(scale=1): analyze_document_query = gr.Textbox(placeholder='Enter a document here', label='Query', interactive=True, lines=10) with gr.Row(): analyze_document_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True) analyze_document_submit = gr.Button(value='Submit', variant='primary', visible=True) with gr.Column(scale=1): analyze_document_html = gr.HTML(value='', label='Analysis') analyze_document_clear.add([analyze_document_query, analyze_document_html]) analyze_document_submit.click(analyze_document, inputs=[corpus_desc, engine_desc, analyze_document_query], outputs=[analyze_document_html], api_name=False) with gr.Row(): gr.Markdown(''' If you find this tool useful, please kindly cite our paper: ```bibtex @article{Liu2024InfiniGram, title={Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens}, author={Liu, Jiacheng and Min, Sewon and Zettlemoyer, Luke and Choi, Yejin and Hajishirzi, Hannaneh}, journal={arXiv preprint arXiv:2401.17377}, year={2024} } ``` ''') for d in demo.dependencies: d['api_name'] = False for d in demo.config['dependencies']: d['api_name'] = False # if DEBUG: # print(demo.dependencies) # print(demo.config['dependencies']) demo.queue( default_concurrency_limit=DEFAULT_CONCURRENCY_LIMIT, max_size=MAX_SIZE, api_open=False, ).launch( max_threads=MAX_THREADS, debug=DEBUG, show_api=False, ) # for d in gr.context.Context.root_block.dependencies: # d['api_name'] = False # if DEBUG: # print(gr.context.Context.root_block.dependencies)