Spaces:
Runtime error
Runtime error
File size: 4,790 Bytes
129cd69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
from typing import Any, Dict, Iterator, Optional
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.language.cobol import CobolSegmenter
from langchain.document_loaders.parsers.language.javascript import JavaScriptSegmenter
from langchain.document_loaders.parsers.language.python import PythonSegmenter
from langchain.text_splitter import Language
LANGUAGE_EXTENSIONS: Dict[str, str] = {
"py": Language.PYTHON,
"js": Language.JS,
"cobol": Language.COBOL,
}
LANGUAGE_SEGMENTERS: Dict[str, Any] = {
Language.PYTHON: PythonSegmenter,
Language.JS: JavaScriptSegmenter,
Language.COBOL: CobolSegmenter,
}
class LanguageParser(BaseBlobParser):
"""Parse using the respective programming language syntax.
Each top-level function and class in the code is loaded into separate documents.
Furthermore, an extra document is generated, containing the remaining top-level code
that excludes the already segmented functions and classes.
This approach can potentially improve the accuracy of QA models over source code.
Currently, the supported languages for code parsing are Python and JavaScript.
The language used for parsing can be configured, along with the minimum number of
lines required to activate the splitting based on syntax.
Examples:
.. code-block:: python
from langchain.text_splitter.Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
loader = GenericLoader.from_filesystem(
"./code",
glob="**/*",
suffixes=[".py", ".js"],
parser=LanguageParser()
)
docs = loader.load()
Example instantiations to manually select the language:
.. code-block:: python
from langchain.text_splitter import Language
loader = GenericLoader.from_filesystem(
"./code",
glob="**/*",
suffixes=[".py"],
parser=LanguageParser(language=Language.PYTHON)
)
Example instantiations to set number of lines threshold:
.. code-block:: python
loader = GenericLoader.from_filesystem(
"./code",
glob="**/*",
suffixes=[".py"],
parser=LanguageParser(parser_threshold=200)
)
"""
def __init__(self, language: Optional[Language] = None, parser_threshold: int = 0):
"""
Language parser that split code using the respective language syntax.
Args:
language: If None (default), it will try to infer language from source.
parser_threshold: Minimum lines needed to activate parsing (0 by default).
"""
self.language = language
self.parser_threshold = parser_threshold
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
code = blob.as_string()
language = self.language or (
LANGUAGE_EXTENSIONS.get(blob.source.rsplit(".", 1)[-1])
if isinstance(blob.source, str)
else None
)
if language is None:
yield Document(
page_content=code,
metadata={
"source": blob.source,
},
)
return
if self.parser_threshold >= len(code.splitlines()):
yield Document(
page_content=code,
metadata={
"source": blob.source,
"language": language,
},
)
return
self.Segmenter = LANGUAGE_SEGMENTERS[language]
segmenter = self.Segmenter(blob.as_string())
if not segmenter.is_valid():
yield Document(
page_content=code,
metadata={
"source": blob.source,
},
)
return
for functions_classes in segmenter.extract_functions_classes():
yield Document(
page_content=functions_classes,
metadata={
"source": blob.source,
"content_type": "functions_classes",
"language": language,
},
)
yield Document(
page_content=segmenter.simplify_code(),
metadata={
"source": blob.source,
"content_type": "simplified_code",
"language": language,
},
)
|