Spaces:

kgauvin603
/

rag-10k-analysis

Sleeping

App Files Files Community

rag-10k-analysis / old_app.py

kgauvin603

Rename app.py to old_app.py

9513235 verified 2 months ago

raw

history blame

No virus

4.09 kB

	# Import the necessary libraries
	import subprocess
	import sys

	# Function to install a package using pip
	def install(package: str) -> None:
	    """Install *package* with pip into the running interpreter's environment.

	    Args:
	        package: A pip requirement specifier, e.g. ``"gradio"`` or
	            ``"openai==1.23.2"``.

	    Raises:
	        subprocess.CalledProcessError: If pip exits with a non-zero status.
	    """
	    # ``sys.executable -m pip`` targets the interpreter running this script
	    # rather than whichever ``pip`` happens to be first on PATH.
	    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

	# Install the runtime dependencies, in this order, before the imports that
	# follow. The first failing install aborts the remaining ones and is
	# reported, but the script itself keeps running (best-effort install).
	REQUIRED_PACKAGES = (
	    "gradio",
	    "openai==1.23.2",
	    "tiktoken==0.6.0",
	    "pypdf==4.0.1",
	    "langchain==0.1.1",
	    "langchain-community==0.0.13",
	    "chromadb==0.4.22",
	    "sentence-transformers==2.3.1",
	)
	try:
	    for requirement in REQUIRED_PACKAGES:
	        install(requirement)
	except subprocess.CalledProcessError as e:
	    print(f"An error occurred: {e}")

	import gradio as gr
	import os
	import uuid
	import json
	import pandas as pd
	import subprocess
	from openai import OpenAI
	from huggingface_hub import HfApi
	from huggingface_hub import CommitScheduler
	from huggingface_hub import hf_hub_download
	import zipfile
	# Hugging Face repository id (the matching file path below is commented out).
	# NOTE(review): repo_id is reassigned to "kgauvin603/10k-reports" later in
	# this file before any visible use of this value -- confirm whether this
	# first assignment is still needed.
	repo_id = "kgauvin603/rag-10k"
	#file_path = "dataset.zip"

	# Download the file
	#downloaded_file = hf_hub_download(repo_id, file_path)

	# Print the path to the downloaded file
	#print(f"Downloaded file is located at: {downloaded_file}")

	from langchain_community.embeddings.sentence_transformer import (
	SentenceTransformerEmbeddings
	)
	from langchain_community.vectorstores import Chroma
	#from google.colab import userdata, drive
	from pathlib import Path
	from langchain.document_loaders import PyPDFDirectoryLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	import json
	import tiktoken
	import pandas as pd
	import tiktoken

	# Progress marker: all imports above completed.
	print("Pass 1")

	# Embedding model backed by the 'thenlper/gte-large' sentence-transformers
	# checkpoint. Only the model is created here, not the vector store.
	embedding_model = SentenceTransformerEmbeddings(
	    model_name="thenlper/gte-large",
	)

	# If the dataset directory exists, remove it and all of its contents

	#if os.path.exists('dataset'):
	# !rm -rf dataset

	# If collection_db exists, remove it and all of the contents within

	#if os.path.exists('collection_db'):
	# !rm -rf dataset

	#Mount the Google Drive
	#drive.mount('/content/drive')

	# Upload Dataset-10k.zip and unzip it into the dataset folder using the -d option
	#!unzip Dataset-10k.zip -d dataset

	import subprocess

	# Install the extra packages with the running interpreter's pip. The
	# command is an argument list executed without a shell, so nothing is
	# subject to shell parsing, and ``sys.executable -m pip`` installs into the
	# same environment this script runs in (as the install() helper does).
	# NOTE(review): huggingface_hub is already imported earlier in this file,
	# so this install runs too late to satisfy that import -- consider moving
	# these packages into the initial install step.
	#command = "unzip kgauvin603/10k-reports/Dataset-10k.zip -d dataset"
	command = [
	    sys.executable, "-m", "pip", "install",
	    "transformers", "huggingface_hub", "requests",
	]
	# Execute the command; a failed install is reported but is not fatal.
	try:
	    subprocess.run(command, check=True)
	except subprocess.CalledProcessError as e:
	    print(f"An error occurred: {e}")

	from huggingface_hub import hf_hub_download
	import zipfile
	import os
	import requests

	# Progress marker: optional installs and the second import group are done.
	print("Pass 2")

	# Dataset page: https://huggingface.co/datasets/kgauvin603/10k-reports

	# Define the repository and file path
	repo_id = "kgauvin603/10k-reports"
	file_path = "Dataset-10k.zip"

	# Construct the direct-download URL for the file. Raw file contents on the
	# Hub are served under ".../resolve/<revision>/<path>"; without that
	# segment the URL does not address the file itself, so the zip could not be
	# opened afterwards.
	file_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/{file_path}"
	print(f"File URL: {file_url}")

	# Download the zip file. The timeout keeps a stalled connection from
	# hanging start-up forever (requests applies no timeout by default).
	response = requests.get(file_url, timeout=60)
	response.raise_for_status()  # Ensure the request was successful

	# ``io`` is needed for BytesIO and is not imported anywhere above.
	import io

	# Open the downloaded archive directly from memory (nothing is written to
	# disk here) and print a short preview of every member.
	with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
	    # List the files in the zip archive
	    zip_file_list = zip_ref.namelist()
	    print(f"Files in the zip archive: {zip_file_list}")

	    # Read each member while the archive is still open; members cannot be
	    # opened once the ZipFile has been closed.
	    for file_name in zip_file_list:
	        with zip_ref.open(file_name) as file:
	            content = file.read()
	            # Print the first 100 bytes of each file (content is bytes).
	            print(f"Content of {file_name}: {content[:100]}...")

	# If you need to save the extracted files to disk, you can do so as follows:
	# Define the extraction path
	extraction_path = "./dataset"
	import os