Spaces:
Runtime error
Runtime error
from typing import Any, Dict, Iterator, Optional | |
from langchain.docstore.document import Document | |
from langchain.document_loaders.base import BaseBlobParser | |
from langchain.document_loaders.blob_loaders import Blob | |
from langchain.document_loaders.parsers.language.cobol import CobolSegmenter | |
from langchain.document_loaders.parsers.language.javascript import JavaScriptSegmenter | |
from langchain.document_loaders.parsers.language.python import PythonSegmenter | |
from langchain.text_splitter import Language | |
LANGUAGE_EXTENSIONS: Dict[str, str] = { | |
"py": Language.PYTHON, | |
"js": Language.JS, | |
"cobol": Language.COBOL, | |
} | |
LANGUAGE_SEGMENTERS: Dict[str, Any] = { | |
Language.PYTHON: PythonSegmenter, | |
Language.JS: JavaScriptSegmenter, | |
Language.COBOL: CobolSegmenter, | |
} | |
class LanguageParser(BaseBlobParser): | |
"""Parse using the respective programming language syntax. | |
Each top-level function and class in the code is loaded into separate documents. | |
Furthermore, an extra document is generated, containing the remaining top-level code | |
that excludes the already segmented functions and classes. | |
This approach can potentially improve the accuracy of QA models over source code. | |
Currently, the supported languages for code parsing are Python and JavaScript. | |
The language used for parsing can be configured, along with the minimum number of | |
lines required to activate the splitting based on syntax. | |
Examples: | |
.. code-block:: python | |
from langchain.text_splitter.Language | |
from langchain.document_loaders.generic import GenericLoader | |
from langchain.document_loaders.parsers import LanguageParser | |
loader = GenericLoader.from_filesystem( | |
"./code", | |
glob="**/*", | |
suffixes=[".py", ".js"], | |
parser=LanguageParser() | |
) | |
docs = loader.load() | |
Example instantiations to manually select the language: | |
.. code-block:: python | |
from langchain.text_splitter import Language | |
loader = GenericLoader.from_filesystem( | |
"./code", | |
glob="**/*", | |
suffixes=[".py"], | |
parser=LanguageParser(language=Language.PYTHON) | |
) | |
Example instantiations to set number of lines threshold: | |
.. code-block:: python | |
loader = GenericLoader.from_filesystem( | |
"./code", | |
glob="**/*", | |
suffixes=[".py"], | |
parser=LanguageParser(parser_threshold=200) | |
) | |
""" | |
def __init__(self, language: Optional[Language] = None, parser_threshold: int = 0): | |
""" | |
Language parser that split code using the respective language syntax. | |
Args: | |
language: If None (default), it will try to infer language from source. | |
parser_threshold: Minimum lines needed to activate parsing (0 by default). | |
""" | |
self.language = language | |
self.parser_threshold = parser_threshold | |
def lazy_parse(self, blob: Blob) -> Iterator[Document]: | |
code = blob.as_string() | |
language = self.language or ( | |
LANGUAGE_EXTENSIONS.get(blob.source.rsplit(".", 1)[-1]) | |
if isinstance(blob.source, str) | |
else None | |
) | |
if language is None: | |
yield Document( | |
page_content=code, | |
metadata={ | |
"source": blob.source, | |
}, | |
) | |
return | |
if self.parser_threshold >= len(code.splitlines()): | |
yield Document( | |
page_content=code, | |
metadata={ | |
"source": blob.source, | |
"language": language, | |
}, | |
) | |
return | |
self.Segmenter = LANGUAGE_SEGMENTERS[language] | |
segmenter = self.Segmenter(blob.as_string()) | |
if not segmenter.is_valid(): | |
yield Document( | |
page_content=code, | |
metadata={ | |
"source": blob.source, | |
}, | |
) | |
return | |
for functions_classes in segmenter.extract_functions_classes(): | |
yield Document( | |
page_content=functions_classes, | |
metadata={ | |
"source": blob.source, | |
"content_type": "functions_classes", | |
"language": language, | |
}, | |
) | |
yield Document( | |
page_content=segmenter.simplify_code(), | |
metadata={ | |
"source": blob.source, | |
"content_type": "simplified_code", | |
"language": language, | |
}, | |
) | |