| | import joblib |
| | import pandas as pd |
| | from PIL import Image |
| | import pytesseract |
| | from scipy.sparse import hstack, csr_matrix |
| |
|
print("🔄 Loading model and preprocessors...")

# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only point this at a bundle from a trusted source.
bundle = joblib.load("/Users/vidyasagarkaruturi/Downloads/machine learning/src/document_classifier_xgb.pkl")

# Expose the four bundle entries used by the prediction code as module-level
# names, looked up in the same order as the keys are listed.
model, word_vectorizer, char_vectorizer, scaler = (
    bundle[key]
    for key in ("model", "word_vectorizer", "char_vectorizer", "scaler")
)

print("✅ Loaded successfully")
| |
|
| | |
| | |
| | |
# Integer class index -> human-readable document type. The position of each
# name in the tuple is its index (0 through 15), so the order must not change.
label_map = dict(enumerate((
    "advertisement",
    "budget",
    "email",
    "file folder",
    "form",
    "handwritten",
    "invoice",
    "letter",
    "memo",
    "news article",
    "presentation",
    "questionnaire",
    "resume",
    "scientific publication",
    "scientific report",
    "specification",
)))
| |
|
| | |
| | |
| | |
| |
|
def extract_features(text):
    """Compute simple character-level statistics for a piece of text.

    Args:
        text: The string to analyse (in this file, the OCR output).

    Returns:
        A dict with, in this order, the keys ``char_count``,
        ``digit_count``, ``uppercase_count``, ``currency_count`` (number of
        ``$`` plus ``€`` characters) and ``line_count`` (number of newline
        characters, not number of lines).
    """
    digits = 0
    uppercase = 0
    # One pass over the characters instead of one generator per statistic.
    for ch in text:
        if ch.isdigit():
            digits += 1
        if ch.isupper():
            uppercase += 1

    dollars = text.count("$")
    euros = text.count("€")
    newlines = text.count("\n")

    features = {}
    features["char_count"] = len(text)
    features["digit_count"] = digits
    features["uppercase_count"] = uppercase
    features["currency_count"] = dollars + euros
    features["line_count"] = newlines
    return features
| |
|
| | |
| | |
| | |
| |
|
def _print_debug_section(name, value):
    """Print *value* framed by the "<name> starts" / "<name> ends" marker lines."""
    rule = "-------------------------------------------"
    print(f"{name} starts {rule}")
    print(value)
    print(f"{name} ends {rule}")


def predict_document(image_path):
    """Run OCR on an image, classify the extracted text and print the result.

    The function OCRs the image with pytesseract, transforms the text with
    the module-level ``word_vectorizer`` and ``char_vectorizer``, scales the
    statistics from ``extract_features`` with the module-level ``scaler``,
    stacks everything column-wise (word, then char, then numeric) and feeds
    the result to the module-level ``model``. Every intermediate value is
    dumped to stdout between "starts"/"ends" marker lines, followed by the
    predicted document type and its confidence as a percentage.

    Args:
        image_path: Path of the image file to classify; passed to
            ``PIL.Image.open``.

    Returns:
        None. The prediction is only printed.

    Raises:
        KeyError: If the class returned by the model is not a key of
            ``label_map``.
        Exceptions raised by ``Image.open``, pytesseract, the vectorizers,
        the scaler or the model are propagated unchanged.
    """
    print(f"\n📄 Running OCR on: {image_path}")

    # Image.open is lazy and keeps the underlying file open; the context
    # manager releases the handle as soon as OCR has finished instead of
    # leaking it until garbage collection.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)
    _print_debug_section("text", text)

    # Text features: word-level and char-level vectorizer outputs side by side.
    word_features = word_vectorizer.transform([text])
    _print_debug_section("word_features", word_features)
    char_features = char_vectorizer.transform([text])
    _print_debug_section("char_features", char_features)
    text_features = hstack([word_features, char_features])
    _print_debug_section("text_features", text_features)

    # Numeric features: a one-row DataFrame so the scaler receives named
    # columns, then converted to sparse so it can be stacked with the text
    # features.
    numeric_dict = extract_features(text)
    _print_debug_section("numeric_dict", numeric_dict)
    numeric_df = pd.DataFrame([numeric_dict])
    _print_debug_section("numeric_df", numeric_df)
    numeric_scaled = scaler.transform(numeric_df)
    _print_debug_section("numeric_scaled", numeric_scaled)
    numeric_sparse = csr_matrix(numeric_scaled)
    _print_debug_section("numeric_sparse", numeric_sparse)

    # Column order (text features first, numeric last) presumably has to
    # match the layout used when the model was fitted -- do not reorder.
    final_features = hstack([text_features, numeric_sparse])
    _print_debug_section("final_features", final_features)

    prediction = model.predict(final_features)[0]
    # Confidence is the largest class probability for the single input row.
    probability = model.predict_proba(final_features).max()

    print("\n🎯 Prediction:")
    print("Document Type:", label_map[prediction])
    print("Confidence:", round(probability * 100, 2), "%")
| |
|
| |
|
| | |
| | |
| | |
| |
|
if __name__ == "__main__":
    import sys

    # Classify the image given as the first command-line argument; when no
    # argument is supplied, fall back to the original hard-coded screenshot
    # so running the script with no arguments behaves exactly as before.
    default_image_path = "/Users/vidyasagarkaruturi/Desktop/Screenshot 2025-10-13 at 9.20.04 AM.png"
    image_path = sys.argv[1] if len(sys.argv) > 1 else default_image_path
    predict_document(image_path)