silvanocerza committed on
Commit
6e812cd
1 Parent(s): 0a4996a

Go back to cloning with git and fix related issues

Browse files
Files changed (3) hide show
  1. main.py +21 -19
  2. packages.txt +0 -1
  3. requirements.txt +0 -1
main.py CHANGED
@@ -1,8 +1,7 @@
1
  from typing import List, Tuple
2
  from pathlib import Path
3
  import os
4
- import zipfile
5
- import io
6
 
7
  from dotenv import load_dotenv
8
  from haystack.preview import Pipeline
@@ -19,7 +18,6 @@ from haystack.preview.components.writers import DocumentWriter
19
  from haystack.preview.components.file_converters import TextFileToDocument
20
  from haystack.preview.document_stores.memory import MemoryDocumentStore
21
  import streamlit as st
22
- import requests
23
 
24
  # Load the environment variables, we're going to need it for OpenAI
25
  load_dotenv()
@@ -64,30 +62,34 @@ DOCUMENTATIONS = [
64
  ),
65
  ]
66
 
 
 
67
 
68
  @st.cache_data(show_spinner=False)
69
  def fetch(documentations: List[Tuple[str, str, str]]):
70
  files = []
71
- docs_path = Path(__file__).parent / "downloaded_docs"
72
- for name, url, zip_path, pattern in documentations:
 
 
73
  st.write(f"Fetching {name} repository")
74
- # All projects use `main` as the default branch
75
- branch = "main"
76
- # The name of the folder depends on the name of the repository
77
- # on GitHub plus the branch zip we're downloading
78
- repo_folder = docs_path / (url.split("/")[-1] + f"-{branch}")
79
- if not repo_folder.exists():
80
- res = requests.get(f"{url}{zip_path}", stream=True)
81
- zip = zipfile.ZipFile(io.BytesIO(res.content))
82
- # The zip file contains a folder with the name of the repository
83
- # so we extract directly into the docs folder
84
- zip.extractall(docs_path)
85
-
86
- for p in repo_folder.glob(pattern):
87
  data = {
88
  "path": p,
89
  "metadata": {
90
- "url_source": f"{url}/tree/{branch}/{p.relative_to(repo_folder)}",
91
  "suffix": p.suffix,
92
  },
93
  }
 
1
  from typing import List, Tuple
2
  from pathlib import Path
3
  import os
4
+ import subprocess
 
5
 
6
  from dotenv import load_dotenv
7
  from haystack.preview import Pipeline
 
18
  from haystack.preview.components.file_converters import TextFileToDocument
19
  from haystack.preview.document_stores.memory import MemoryDocumentStore
20
  import streamlit as st
 
21
 
22
  # Load the environment variables, we're going to need it for OpenAI
23
  load_dotenv()
 
62
  ),
63
  ]
64
 
65
+ DOCS_PATH = Path(__file__).parent / "downloaded_docs"
66
+
67
 
68
  @st.cache_data(show_spinner=False)
69
  def fetch(documentations: List[Tuple[str, str, str]]):
70
  files = []
71
+ # Create the docs path if it doesn't exist
72
+ DOCS_PATH.mkdir(parents=True, exist_ok=True)
73
+
74
+ for name, url, pattern in documentations:
75
  st.write(f"Fetching {name} repository")
76
+ repo = DOCS_PATH / name
77
+ # Attempt cloning only if it doesn't exist
78
+ if not repo.exists():
79
+ subprocess.run(["git", "clone", "--depth", "1", url, str(repo)], check=True)
80
+ res = subprocess.run(
81
+ ["git", "rev-parse", "--abbrev-ref", "HEAD"],
82
+ check=True,
83
+ capture_output=True,
84
+ encoding="utf-8",
85
+ cwd=repo,
86
+ )
87
+ branch = res.stdout.strip()
88
+ for p in repo.glob(pattern):
89
  data = {
90
  "path": p,
91
  "metadata": {
92
+ "url_source": f"{url}/tree/{branch}/{p.relative_to(repo)}",
93
  "suffix": p.suffix,
94
  },
95
  }
packages.txt DELETED
@@ -1 +0,0 @@
1
- git
 
 
requirements.txt CHANGED
@@ -4,4 +4,3 @@ langdetect
4
  streamlit==1.27.2
5
  python-dotenv
6
  watchdog
7
- requests
 
4
  streamlit==1.27.2
5
  python-dotenv
6
  watchdog