Spaces:

sweepai
/

chunker

Runtime error

App Files Files Community

kevinlu1248 committed on Aug 13, 2023

Commit

d596fb5

•

1 Parent(s): f749736

made app.py better

Browse files

Files changed (1) hide show

app.py +155 -4

app.py CHANGED Viewed

@@ -1,7 +1,158 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()

+from __future__ import annotations
+import re
+import requests
+from dataclasses import dataclass
 import gradio as gr
+from tree_sitter import Tree, Node
+from tree_sitter_languages import get_parser
+def non_whitespace_len(s: str) -> int:
+    """Return the number of characters in ``s`` that are not whitespace.
+
+    A replacement for ``len`` that ignores spaces, tabs, newlines and any
+    other character matched by the regex class ``\\s``.
+    """
+    # Raw string: "\s" in a normal literal is an invalid escape sequence and
+    # triggers a warning on recent Python versions.
+    return len(re.sub(r"\s", "", s))
+def get_line_number(index: int, source_code: str | bytes) -> int:
+    """Return the 0-based number of the line containing offset ``index``.
+
+    Args:
+        index: Offset into ``source_code`` (characters for ``str``, bytes for
+            ``bytes``).
+        source_code: The text to scan; line endings are kept while counting
+            so offsets line up with the original text.
+
+    Returns:
+        The 0-based index of the line that contains ``index``. When ``index``
+        lies at or past the end of the text, the total number of lines is
+        returned instead (0 for empty input).
+    """
+    total_chars = 0
+    # Pre-bind so that empty input (loop body never runs) returns 0 instead of
+    # raising UnboundLocalError.
+    line_number = 0
+    for line_number, line in enumerate(source_code.splitlines(keepends=True), start=1):
+        total_chars += len(line)
+        if total_chars > index:
+            return line_number - 1
+    return line_number
+@dataclass
+class Span:
+    """A half-open ``[start, end)`` range used to slice text or its lines.
+
+    ``end`` may be omitted (or passed as ``None``), in which case it is set to
+    ``start`` and the span is empty.
+    """
+    start: int = 0
+    # Defaults to None (not 0) so that __post_init__ can tell "not given"
+    # apart from an explicit 0; Span(5) is the empty span Span(5, 5).
+    end: int | None = None
+
+    def __post_init__(self):
+        # If end is None, set it to start
+        if self.end is None:
+            self.end = self.start
+
+    def extract(self, s: str) -> str:
+        """Return ``s[start:end]`` (a plain slice of whatever sequence is given)."""
+        return s[self.start: self.end]
+
+    def extract_lines(self, s: str) -> str:
+        """Return lines ``start`` to ``end`` (exclusive) of ``s``, joined by newlines."""
+        return "\n".join(s.splitlines()[self.start:self.end])
+
+    def __add__(self, other: Span | int) -> Span:
+        """Shift the span by an int, or concatenate it with another span.
+
+        e.g. Span(1, 2) + Span(2, 4) = Span(1, 4) (concatenation)
+        There are no safety checks: Span(a, b) + Span(c, d) = Span(a, d)
+        and there is no requirement that b == c.
+
+        Raises:
+            NotImplementedError: if ``other`` is neither an int nor a Span.
+        """
+        if isinstance(other, int):
+            return Span(self.start + other, self.end + other)
+        elif isinstance(other, Span):
+            return Span(self.start, other.end)
+        else:
+            raise NotImplementedError()
+
+    def __len__(self) -> int:
+        """Return ``end - start``, i.e. len(Span(a, b)) == b - a."""
+        return self.end - self.start
+def chunk_tree(
+    tree: Tree,
+    source_code: bytes,
+    MAX_CHARS=512 * 3,
+    coalesce=50  # Any chunk less than 50 characters long gets coalesced with the next chunk
+) -> list[Span]:
+    """Split a parsed source file into spans of line numbers.
+
+    Args:
+        tree: The tree-sitter parse tree of ``source_code``.
+        source_code: The UTF-8 encoded source the tree was parsed from.
+        MAX_CHARS: Upper bound, in bytes, on how much sibling nodes are packed
+            into one chunk; a node larger than this is chunked recursively.
+        coalesce: A chunk is merged with the following one until it holds more
+            than this many non-whitespace characters and at least one newline.
+
+    Returns:
+        Non-empty ``Span`` objects whose ``start``/``end`` are line numbers as
+        produced by ``get_line_number``.
+    """
+    # 1. Recursively form chunks based on the last post (https://docs.sweep.dev/blogs/chunking-2m-files)
+    def chunk_node(node: Node) -> list[Span]:
+        chunks: list[Span] = []
+        current_chunk: Span = Span(node.start_byte, node.start_byte)
+        node_children = node.children
+        for child in node_children:
+            if child.end_byte - child.start_byte > MAX_CHARS:
+                # Child alone is too big: flush what we have and recurse into it.
+                chunks.append(current_chunk)
+                current_chunk = Span(child.end_byte, child.end_byte)
+                chunks.extend(chunk_node(child))
+            elif child.end_byte - child.start_byte + len(current_chunk) > MAX_CHARS:
+                # Child would overflow the current chunk: start a new one with it.
+                chunks.append(current_chunk)
+                current_chunk = Span(child.start_byte, child.end_byte)
+            else:
+                current_chunk += Span(child.start_byte, child.end_byte)
+        chunks.append(current_chunk)
+        return chunks
+    chunks = chunk_node(tree.root_node)
+    # 2. Filling in the gaps
+    for prev, curr in zip(chunks[:-1], chunks[1:]):
+        prev.end = curr.start
+    # Stretch the last chunk to the end of the file. chunk_node always returns
+    # at least one chunk, so indexing is safe even when the loop above never
+    # runs (previously the loop variable was used here and was unbound for
+    # single-chunk inputs).
+    chunks[-1].end = tree.root_node.end_byte
+    # 3. Combining small chunks with bigger ones
+    new_chunks = []
+    current_chunk = Span(0, 0)
+    for chunk in chunks:
+        current_chunk += chunk
+        # Spans hold byte offsets, so slice the bytes first and decode after;
+        # slicing the decoded string would be wrong for non-ASCII source.
+        text = source_code[current_chunk.start:current_chunk.end].decode("utf-8", errors="replace")
+        if non_whitespace_len(text) > coalesce and "\n" in text:
+            new_chunks.append(current_chunk)
+            current_chunk = Span(chunk.end, chunk.end)
+    if len(current_chunk) > 0:
+        new_chunks.append(current_chunk)
+    # 4. Changing line numbers
+    line_chunks = [
+        Span(
+            get_line_number(chunk.start, source_code),
+            get_line_number(chunk.end, source_code)
+        )
+        for chunk in new_chunks
+    ]
+    # 5. Eliminating empty chunks
+    line_chunks = [chunk for chunk in line_chunks if len(chunk) > 0]
+    return line_chunks
+# Custom CSS handed to gr.Blocks(css=...); the rule for the "code_container"
+# class (set on the input code box) is currently empty.
+css = """
+.code_container {
+}
+"""
+def chunk_code(
+    code: str,
+    language: str,
+    MAX_CHARS: int,
+    coalesce: int
+):
+    """Parse ``code`` as ``language`` and return its chunks as one string.
+
+    The chunks computed by ``chunk_tree`` are rendered as text and separated
+    by a line of ``=`` characters. Any exception raised along the way is
+    caught and its message is returned instead, so the UI shows the error in
+    the output box rather than failing.
+    """
+    separator = "\n\n====================\n\n"
+    try:
+        parser = get_parser(language)
+        encoded = code.encode("utf-8")
+        tree = parser.parse(encoded)
+        spans = chunk_tree(tree, encoded, MAX_CHARS=MAX_CHARS, coalesce=coalesce)
+        return separator.join(span.extract_lines(code) for span in spans)
+    except Exception as e:
+        return str(e)
+# Build the Gradio UI: a settings row (language, max characters, coalesce) and
+# a row with the editable input code next to the read-only chunked output.
+with gr.Blocks(css=css) as demo:
+    gr.Markdown("Start typing below and the chunked output will automatically show up.")
+    default_file = "https://raw.githubusercontent.com/sweepai/sweep/b267b613d4c706eaf959fe6789f11e9a856521d1/sweepai/handlers/on_check_suite.py"
+    # Fetch the sample shown on first load. Bound the wait and fall back to an
+    # empty editor so a network problem does not prevent the app from starting.
+    try:
+        response = requests.get(default_file, timeout=10)
+        response.raise_for_status()
+        default_code = response.text
+    except requests.RequestException:
+        default_code = ""
+    with gr.Row():
+        language = gr.Dropdown(["python", "javascript", "go", "ruby", "java", "php", "c", "cpp", "rust", "haskell"], label="Language", value="python")
+        max_chars = gr.Slider(100, 3000, 1500, label="Max Characters")
+        coalesce = gr.Slider(0, 300, 100, label="Coalesce")
+    with gr.Row():
+        inp = gr.Code(placeholder="Enter the code here", label="Code to Chunk", language=language.value, lines=60, elem_classes="code_container", value=default_code)
+        out = gr.Code(label="Chunked Code", language=language.value, lines=60, value=chunk_code(default_code, language.value, max_chars.value, coalesce.value))
+    def update_language(code, language, max_chars, coalesce):
+        # Event handlers receive the components' current *values*, so ``code``
+        # is already the source text (a str), not the gr.Code component.
+        return (
+            gr.update(language=language),
+            gr.update(language=language, value=chunk_code(code, language, max_chars, coalesce))
+        )
+    # Re-chunk whenever the language, either slider, or the code itself changes.
+    language.change(fn=update_language, inputs=[inp, language, max_chars, coalesce], outputs=[inp, out])
+    max_chars.change(fn=chunk_code, inputs=[inp, language, max_chars, coalesce], outputs=out)
+    coalesce.change(fn=chunk_code, inputs=[inp, language, max_chars, coalesce], outputs=out)
+    inp.change(fn=chunk_code, inputs=[inp, language, max_chars, coalesce], outputs=out)
+demo.launch()