# Import the necessary libraries
import subprocess
import sys

# Function to install a package using pip
def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install required packages
try:
    install("gradio")
    install("openai==1.23.2")
    install("tiktoken==0.6.0")
    install("pypdf==4.0.1")
    install("langchain==0.1.1")
    install("langchain-community==0.0.13")
    install("chromadb==0.4.22")
    install("sentence-transformers==2.3.1")
except subprocess.CalledProcessError as e:
    print(f"An error occurred: {e}")

import gradio as gr
import os
import io  # needed below for zipfile.ZipFile(io.BytesIO(...))
import uuid
import json
import zipfile
import pandas as pd
import requests
from openai import OpenAI
from huggingface_hub import HfApi
from huggingface_hub import CommitScheduler
from huggingface_hub import hf_hub_download

# Define your repository and file path
repo_id = "kgauvin603/rag-10k"
#file_path = "dataset.zip"

# Download the file
#downloaded_file = hf_hub_download(repo_id, file_path)

# Print the path to the downloaded file
#print(f"Downloaded file is located at: {downloaded_file}")

from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings
)
from langchain_community.vectorstores import Chroma
#from google.colab import userdata, drive
from pathlib import Path
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken

print("Pass 1")

# Define the embedding model used to build the vectorstore
embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')

# If the dataset directory exists, remove it and all of its contents
#if os.path.exists('dataset'):
#    !rm -rf dataset

# If collection_db exists, remove it and all of its contents
#if os.path.exists('collection_db'):
#    !rm -rf collection_db

# Mount the Google Drive
#drive.mount('/content/drive')

# Upload Dataset-10k.zip and unzip it into the dataset folder using the -d option
#!unzip Dataset-10k.zip -d dataset

# Command to install the remaining dependencies
#command = "unzip kgauvin603/10k-reports/Dataset-10k.zip -d dataset"
command = "pip install transformers huggingface_hub requests"

# Execute the command
try:
    subprocess.run(command, check=True, shell=True)
except subprocess.CalledProcessError as e:
    print(f"An error occurred: {e}")

print("Pass 2")

#https://huggingface.co/datasets/kgauvin603/10k-reports

# Define the repository and file path
repo_id = "kgauvin603/10k-reports"
file_path = "Dataset-10k.zip"

# Construct the download URL for the file in the dataset repository;
# the resolve/main segment is required to fetch the raw file rather
# than the repository page
file_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/{file_path}"
print(f"File URL: {file_url}")

# Download the zip file
response = requests.get(file_url)
response.raise_for_status()  # Ensure the request was successful
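# --- Alternative download (a sketch, not part of the original flow) ---
# The same archive could be fetched with hf_hub_download (imported above),
# which caches the file locally and avoids hand-building the resolve URL;
# repo_type="dataset" is needed because 10k-reports is a dataset repo.
#zip_path = hf_hub_download(repo_id=repo_id, filename=file_path, repo_type="dataset")
#with zipfile.ZipFile(zip_path) as zf:
#    zf.extractall("dataset")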
# Unzip the file in memory
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
    # List the files in the zip archive
    zip_file_list = zip_ref.namelist()
    print(f"Files in the zip archive: {zip_file_list}")

    # Extract specific files or work with them directly in memory
    for file_name in zip_file_list:
        with zip_ref.open(file_name) as file:
            content = file.read()
            print(f"Content of {file_name}: {content[:100]}...")  # Print the first 100 bytes of each file

    # If you need to save the extracted files to disk, you can do so as follows:
    # Define the extraction path and extract the archive into it
    extraction_path = "./dataset"
    os.makedirs(extraction_path, exist_ok=True)
    zip_ref.extractall(extraction_path)
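# --- Next step (a sketch, not from the original section): index the extracted
# reports with the loader, splitter, and embedding_model set up above. The
# chunk_size/chunk_overlap values and the "10k_reports" collection name are
# illustrative assumptions; "collection_db" matches the directory mentioned
# in the comments near the top of this script.
if os.path.exists(extraction_path):
    # Load every PDF found in the extraction directory
    pdf_loader = PyPDFDirectoryLoader(extraction_path)
    documents = pdf_loader.load()

    # Split the reports into overlapping chunks sized for retrieval
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(documents)

    # Embed the chunks and persist them in a Chroma collection
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_model,
        collection_name="10k_reports",
        persist_directory="collection_db",
    )
    print(f"Indexed {len(chunks)} chunks into collection_db")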