jerome-white committed on
Commit ef3d4ad
1 Parent(s): 9848c62

Initial commit

app.py ADDED
@@ -0,0 +1,91 @@
+ import os
+ import json
+ from tempfile import NamedTemporaryFile
+
+ import gradio as gr
+ from openai import OpenAI
+
+ from mylib import (
+     Logger,
+     FileManager,
+     ChatController,
+     MessageHandler,
+     NumericCitations,
+ )
+
+ #
+ #
+ #
+ class ErrorLogger:
+     def __init__(self, path):
+         self.path = path
+         if not self.path.exists():
+             self.path.mkdir(parents=True, exist_ok=True)
+
+     def dump(self, prompt, error):
+         msg = {
+             'prompt': prompt,
+         }
+         msg.update(error.to_dict())
+         output = json.dumps(msg, indent=2)
+
+         with NamedTemporaryFile(mode='w',
+                                 prefix='',
+                                 dir=self.path,
+                                 delete=False) as fp:
+             print(output, file=fp)
+             return fp.name
+
+ #
+ #
+ #
+ class FileChat:
+     def __init__(self, client, config):
+         self.database = FileManager(client, config['chat']['prefix'])
+         self.messenger = MessageHandler(client, NumericCitations)
+         self.chat = ChatController(client, config['openai'], config['chat'])
+
+     def upload(self, *args):
+         (data, ) = args
+         return self.database(data)
+
+     def prompt(self, *args):
+         (message, *_) = args
+         if not self.database:
+             raise gr.Error('Please upload your documents to begin')
+
+         return self.messenger(self.chat(message, self.database))
+
+ #
+ #
+ #
+ with open(os.getenv('FILE_CHAT_CONFIG')) as fp:
+     config = json.load(fp)
+
+ with gr.Blocks() as demo:
+     client = OpenAI(api_key=config['openai']['api_key'])
+     mychat = FileChat(client, config)
+
+     with gr.Row():
+         upload = gr.UploadButton(file_count='multiple')
+         text = gr.Textbox(label='Files uploaded', interactive=False)
+         upload.upload(mychat.upload, upload, text)
+
+     gr.ChatInterface(
+         fn=mychat.prompt,
+         additional_inputs=[
+             upload,
+             text,
+         ],
+         retry_btn=None,
+         undo_btn=None,
+         clear_btn=None,
+         # additional_inputs_accordion=gr.Accordion(
+         #     label='Upload documents',
+         #     open=True,
+         # ),
+     )
+
+ if __name__ == '__main__':
+     # demo.queue().launch(server_name='0.0.0.0', **config['gradio'])
+     demo.queue().launch(server_name='0.0.0.0')
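
Note: app.py loads its settings from the JSON file named by the FILE_CHAT_CONFIG environment variable. The sketch below only illustrates the keys that app.py and mylib actually read (openai.api_key, openai.assistant_name, chat.prefix, chat.system_prompt, chat.retries); every value is a placeholder, and the optional gradio section is consulted only by the commented-out launch line.

    # A possible config.json for FILE_CHAT_CONFIG (all values are placeholders).
    import json

    config = {
        'openai': {
            'api_key': 'sk-...',              # OpenAI API key
            'assistant_name': 'file-chat',    # used by ChatController
        },
        'chat': {
            'prefix': 'file-chat-',           # vector store name prefix
            'system_prompt': 'prompts/system.txt',
            'retries': 3,                     # run attempts before TimeoutError
        },
    }

    with open('config.json', 'w') as fp:
        json.dump(config, fp, indent=2)

    # then: export FILE_CHAT_CONFIG=config.json and run app.py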
mylib/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from ._chat import ChatController
+ from ._files import FileManager
+ from ._logging import Logger
+ from ._message import MessageHandler
+ from ._citations import NumericCitations, NoCitations
mylib/_chat.py ADDED
@@ -0,0 +1,86 @@
+ import math
+ import time
+ from pathlib import Path
+
+ import pandas as pd
+
+ from ._logging import Logger
+
+ def parse_wait_time(err):
+     if err.code == 'rate_limit_exceeded':
+         for i in err.message.split('. '):
+             if i.startswith('Please try again in'):
+                 (*_, wait) = i.split()
+                 return (pd
+                         .to_timedelta(wait)
+                         .total_seconds())
+
+     raise TypeError(err.code)
+
+ class ChatController:
+     _gpt_defaults = {
+         'model': 'gpt-4o',
+         'max_completion_tokens': 2 ** 12,
+     }
+
+     def __init__(self, client, gpt, chat):
+         self.client = client
+         self.gpt = gpt
+         self.chat = chat
+
+         for i in self._gpt_defaults.items():
+             self.gpt.setdefault(*i)
+         instructions = Path(self.chat['system_prompt'])
+
+         self.assistant = self.client.beta.assistants.create(
+             name=self.gpt['assistant_name'],
+             model=self.gpt['model'],
+             instructions=instructions.read_text(),
+             temperature=0.1,
+             tools=[{
+                 'type': 'file_search',
+             }],
+         )
+         self.thread = self.client.beta.threads.create()
+         self.attached = False
+
+     def __call__(self, prompt, database):
+         if not self.attached:
+             self.client.beta.assistants.update(
+                 assistant_id=self.assistant.id,
+                 tool_resources={
+                     'file_search': {
+                         'vector_store_ids': [
+                             database.vector_store_id,
+                         ],
+                     },
+                 },
+             )
+             self.attached = True
+
+         return self.send(prompt)
+
+     def send(self, content):
+         self.client.beta.threads.messages.create(
+             self.thread.id,
+             role='user',
+             content=content,
+         )
+
+         for i in range(self.chat['retries']):
+             run = self.client.beta.threads.runs.create_and_poll(
+                 thread_id=self.thread.id,
+                 assistant_id=self.assistant.id,
+             )
+             if run.status == 'completed':
+                 return self.client.beta.threads.messages.list(
+                     thread_id=self.thread.id,
+                     run_id=run.id,
+                 )
+             Logger.error('%s (%d): %s', run.status, i + 1, run.last_error)
+
+             rest = math.ceil(parse_wait_time(run.last_error))
+             Logger.warning('Sleeping %ds', rest)
+             time.sleep(rest)
+
+         raise TimeoutError('Message retries exceeded')
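
Note: send retries a failed run after sleeping for the interval that parse_wait_time extracts from a rate-limit error. A minimal sketch with a stand-in error object (the message text here is illustrative, not a verbatim API response):

    from types import SimpleNamespace

    from mylib._chat import parse_wait_time

    # Stand-in for run.last_error; only .code and the
    # 'Please try again in ...' clause of .message are consulted.
    err = SimpleNamespace(
        code='rate_limit_exceeded',
        message='Rate limit reached. Please try again in 1.5s. See the docs',
    )
    print(parse_wait_time(err))  # 1.5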
mylib/_citations.py ADDED
@@ -0,0 +1,47 @@
+ import itertools as it
+
+ class CitationManager:
+     def __init__(self, annotations, client, start=1):
+         self.start = start
+         self.body = {}
+         self.citations = []
+
+         for a in annotations:
+             reference = f'[{start}]'
+             self.body[a.text] = reference
+             document = client.files.retrieve(a.file_citation.file_id)
+             self.citations.append('{} {}:{}--{}'.format(
+                 reference,
+                 document.filename,
+                 a.start_index,
+                 a.end_index,
+             ))
+             start += 1
+
+     def __len__(self):
+         return len(self.citations)
+
+     def __str__(self):
+         raise NotImplementedError()
+
+     def __iter__(self):
+         raise NotImplementedError()
+
+     def replace(self, body):
+         for i in self:
+             body = body.replace(*i)
+
+         return body
+
+ class NumericCitations(CitationManager):
+     def __str__(self):
+         return '\n\n{}'.format('\n'.join(self.citations))
+
+     def __iter__(self):
+         for (k, v) in self.body.items():
+             yield (k, f' {v}')
+
+ class NoCitations(CitationManager):
+     def __str__(self):
+         return ''
+
+     def __iter__(self):
+         yield from zip(self.body, it.repeat(''))
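
Note: CitationManager maps each annotation's marker text to a bracketed reference and looks up the cited file's name through the client; NumericCitations appends the reference list, while NoCitations strips the markers. A sketch with stub objects shaped like the attributes the class reads ('[source]' stands in for whatever marker the model emits):

    from types import SimpleNamespace

    from mylib._citations import NumericCitations

    annotation = SimpleNamespace(
        text='[source]',    # stand-in for the marker inserted by the model
        file_citation=SimpleNamespace(file_id='file-123'),
        start_index=10,
        end_index=25,
    )

    class StubFiles:
        def retrieve(self, file_id):
            return SimpleNamespace(filename='report.pdf')

    client = SimpleNamespace(files=StubFiles())

    cite = NumericCitations([annotation], client)
    print(cite.replace('Sales rose in Q3[source].') + str(cite))
    # Sales rose in Q3 [1].
    #
    # [1] report.pdf:10--25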
mylib/_files.py ADDED
@@ -0,0 +1,122 @@
+ import uuid
+ import hashlib
+ import warnings
+ import itertools as it
+ import functools as ft
+ from pathlib import Path
+
+ class FileObject:
+     _window = 20
+
+     def __init__(self, path):
+         self.fp = path.open('rb')
+         self.chunk = 2 ** self._window
+
+     def close(self):
+         self.fp.close()
+
+     @ft.cached_property
+     def checksum(self):
+         csum = hashlib.blake2b()
+
+         while True:
+             data = self.fp.read(self.chunk)
+             if not data:
+                 break
+             csum.update(data)
+         self.fp.seek(0)
+
+         return csum.hexdigest()
+
+ class FileStream:
+     def __init__(self, paths):
+         self.paths = paths
+         self.streams = []
+
+     def __len__(self):
+         return len(self.streams)
+
+     def __iter__(self):
+         for p in self.paths:
+             stream = FileObject(p)
+             self.streams.append(stream)
+             yield stream
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_value, traceback):
+         for s in self.streams:
+             s.close()
+         self.streams.clear()
+
+ class FileManager:
+     def __init__(self, client, prefix, batch_size=20):
+         self.client = client
+         self.prefix = prefix
+         self.batch_size = batch_size
+
+         self.storage = set()
+         self.vector_store_id = None
+
+     def __bool__(self):
+         return self.vector_store_id is not None
+
+     def __iter__(self):
+         if self:
+             kwargs = {}
+             while True:
+                 vs_files = self.client.beta.vector_stores.files.list(
+                     vector_store_id=self.vector_store_id,
+                     **kwargs,
+                 )
+                 for f in vs_files.data:
+                     result = self.client.files.retrieve(f.id)
+                     yield result.filename
+
+                 if not vs_files.has_more:
+                     break
+                 kwargs['after'] = vs_files.after
+
+     def __call__(self, paths):
+         files = []
+         self.test_and_setup()
+
+         for p in self.ls(paths):
+             with FileStream(p) as stream:
+                 for s in stream:
+                     if s.checksum not in self.storage:
+                         files.append(s.fp)
+                         self.storage.add(s.checksum)
+                 if files:
+                     self.put(files)
+                     files.clear()
+
+         return '\n'.join(self)
+
+     def test_and_setup(self):
+         if self:
+             msg = f'Vector store already exists ({self.vector_store_id})'
+             warnings.warn(msg)
+         else:
+             name = f'{self.prefix}{uuid.uuid4()}'
+             vector_store = self.client.beta.vector_stores.create(
+                 name=name,
+             )
+             self.vector_store_id = vector_store.id
+
+     def ls(self, paths):
+         left = 0
+         while left < len(paths):
+             right = left + self.batch_size
+             yield list(map(Path, it.islice(paths, left, right)))
+             left = right
+
+     def put(self, files):
+         batch = self.client.beta.vector_stores.file_batches.upload_and_poll(
+             vector_store_id=self.vector_store_id,
+             files=files,
+         )
+         if batch.file_counts.completed != len(files):
+             err = f'Error uploading documents: {batch.file_counts}'
+             raise InterruptedError(err)
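
Note: FileManager deduplicates uploads by tracking FileObject.checksum, a BLAKE2b digest of each file's contents, and ls feeds the uploads to the API in batches of batch_size. A small sketch of the checksum behaviour on its own, with no OpenAI client involved:

    from pathlib import Path
    from tempfile import TemporaryDirectory

    from mylib._files import FileObject

    with TemporaryDirectory() as tmp:
        (a, b) = (Path(tmp, 'a.txt'), Path(tmp, 'b.txt'))
        for p in (a, b):
            p.write_text('same contents')

        # Identical bytes hash identically, so FileManager would skip the second.
        (fa, fb) = (FileObject(a), FileObject(b))
        print(fa.checksum == fb.checksum)  # True
        for f in (fa, fb):
            f.close()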
mylib/_logging.py ADDED
@@ -0,0 +1,13 @@
+ import os
+ import logging
+
+ #
+ #
+ #
+ logging.basicConfig(
+     format='[ %(asctime)s %(levelname)s %(filename)s ] %(message)s',
+     datefmt='%H:%M:%S',
+     level=os.environ.get('PYTHONLOGLEVEL', 'WARNING').upper(),
+ )
+ logging.captureWarnings(True)
+ Logger = logging.getLogger(__name__)
mylib/_message.py ADDED
@@ -0,0 +1,20 @@
+ from ._citations import NoCitations
+
+ class MessageHandler:
+     def __init__(self, client, citecls=None):
+         self.client = client
+         self.citecls = citecls or NoCitations
+
+     def __call__(self, message):
+         return '\n'.join(self.each(message))
+
+     def each(self, message):
+         refn = 1
+
+         for m in message:
+             for c in m.content:
+                 cite = self.citecls(c.text.annotations, self.client, refn)
+                 body = cite.replace(c.text.value)
+                 # keep citation numbering continuous across message parts
+                 refn += len(cite)
+
+                 yield f'{body}{cite}'
prompts/system.txt ADDED
@@ -0,0 +1 @@
+ You are an expert file search assistant
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ gradio
+ openai
+ pandas