Spaces:
Running
Running
import json
import mimetypes
import os
import re
import sys

import numpy as np
import pandas as pd
import PyPDF2
import spacy
import torch
from transformers import AutoModel, AutoTokenizer
# Load the spaCy English model, downloading it on first use if it is missing.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Use the running interpreter (sys.executable), not whatever "python"
    # resolves to on PATH, so the model installs into this environment.
    os.system(f"{sys.executable} -m spacy download en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
# Detect file type
def detect_file_type(file_path):
    """Classify *file_path* as "pdf", "csv", or "json" from its MIME type.

    Raises ValueError for any other (or unguessable) MIME type.
    """
    mime, _ = mimetypes.guess_type(file_path)
    mime_to_kind = {
        "application/pdf": "pdf",
        "text/csv": "csv",
        "application/vnd.ms-excel": "csv",  # legacy Excel MIME some systems report for .csv
        "application/json": "json",
    }
    if mime in mime_to_kind:
        return mime_to_kind[mime]
    raise ValueError(f"Unsupported file format: {mime}")
# Extract text from CSV
def extract_text_from_csv(file_path):
    """Return every cell of the CSV (header row excluded) as one space-separated string."""
    frame = pd.read_csv(file_path)
    # stack() flattens row-by-row, dropping NaNs; cells are stringified first.
    cells = frame.astype(str).stack()
    return " ".join(cells)
# Extract text from PDF
def extract_text_from_pdf(file_path):
    """Concatenate the extracted text of every page in the PDF.

    ``page.extract_text()`` may return None for image-only pages; the
    original ``text += page.extract_text()`` would raise TypeError there,
    so None is coalesced to "". Pages are joined with "".join instead of
    repeated += (avoids quadratic string building on large documents).
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    parts = []
    for page in pdf_reader.pages:
        parts.append(page.extract_text() or "")
    return "".join(parts)
# Extract text from JSON
def extract_text_from_json(file_path):
    """Flatten a JSON document into one space-separated string of its leaf values."""
    def recursive_text_extraction(data):
        # Depth-first walk: dict values and list items recurse; leaves stringify.
        if isinstance(data, dict):
            return " ".join(recursive_text_extraction(value) for value in data.values())
        elif isinstance(data, list):
            return " ".join(recursive_text_extraction(item) for item in data)
        else:
            return str(data)

    # Explicit encoding: JSON is UTF-8 per RFC 8259; the original relied on
    # the locale default, which breaks on non-UTF-8 Windows locales.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return recursive_text_extraction(data)
# Generalized text extraction
def extract_text(file_path):
    """Detect the file's type and dispatch to the matching extractor."""
    extractors = {
        "csv": extract_text_from_csv,
        "pdf": extract_text_from_pdf,
        "json": extract_text_from_json,
    }
    kind = detect_file_type(file_path)
    if kind not in extractors:
        # detect_file_type already rejects unknown types; kept as a guard.
        raise ValueError("Unsupported file format")
    return extractors[kind](file_path)
# Preprocess text
def preprocess_text_generalized(text):
    """Clean raw text and return a string of lowercased, lemmatized content words.

    Steps: strip URLs, drop non-printable/non-ASCII characters, collapse
    whitespace, then run spaCy per chunk keeping only non-stopword alphabetic
    tokens (lemmatized).
    """
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # remove URLs
    text = re.sub(r"[^\x20-\x7E]", "", text)             # printable ASCII only
    text = re.sub(r"\s+", " ", text)                     # collapse whitespace

    # Chunk long inputs so each spaCy call stays well under nlp.max_length.
    # Unlike fixed text[i:i+chunk_size] slicing, chunks end on a whitespace
    # boundary so no word is cut in half between chunks (a mid-word cut
    # produced wrong lemmas at every chunk edge).
    chunk_size = 100000
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        if end < len(text):
            boundary = text.rfind(" ", start, end)
            if boundary > start:  # fall back to a hard cut if no space found
                end = boundary
        chunks.append(text[start:end])
        start = end

    processed_chunks = []
    for chunk in chunks:
        doc = nlp(chunk.lower())
        tokens = [
            token.lemma_
            for token in doc
            if not token.is_stop and token.is_alpha
        ]
        processed_chunks.append(" ".join(tokens))
    return " ".join(processed_chunks)
# Generate embeddings
def get_embeddings_from_huggingface(cleaned_text, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Encode *cleaned_text* with a Hugging Face model and return mean-pooled embeddings.

    The text is tokenized (truncated/padded to 512 tokens), passed through the
    model without gradient tracking, and the token-axis mean of the last hidden
    state is returned as a NumPy array (one vector per input sequence).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    encoded = tokenizer(
        cleaned_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state
    # Average over the token dimension -> sentence-level embedding.
    return token_states.mean(dim=1).numpy()