import streamlit as st
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import pdfplumber
import docx
from PIL import Image
from textblob import TextBlob
import re
import fitz  # PyMuPDF, used to rasterize pages for the OCR fallback
import pytesseract
import tempfile
import os
from datetime import datetime


# Zero-shot classifier, loaded once at startup and run on CPU. In a larger app
# this would typically be wrapped in @st.cache_resource so Streamlit reruns
# do not reload the checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")

classifier = pipeline(
    "zero-shot-classification",
    model=model,
    tokenizer=tokenizer,
    device=-1,  # -1 = CPU
)
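

# A minimal sketch of the pipeline's output shape (sample string is illustrative):
#   result = classifier("The exam will be held on 10/03/2024.",
#                       candidate_labels=["REAL", "FAKE"])
#   result["labels"]   # candidate labels sorted by score, e.g. ["REAL", "FAKE"]
#   result["scores"]   # matching probabilities, e.g. [0.71, 0.29]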


def extract_text_from_pdf(file_path):
    """Extract embedded text; fall back to OCR for scanned PDFs."""
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"

    # No embedded text layer: rasterize each page with PyMuPDF and OCR it.
    if not text.strip():
        ocr_text = ""
        doc = fitz.open(file_path)
        for page_num in range(len(doc)):
            page = doc[page_num]
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            ocr_text += pytesseract.image_to_string(img) + "\n"
        text = ocr_text
    return text.strip()


def extract_text_from_docx(file_path):
    doc = docx.Document(file_path)
    return "\n".join(p.text for p in doc.paragraphs).strip()


def extract_text_from_image(file_path):
    return pytesseract.image_to_string(Image.open(file_path)).strip()
|
| | def check_grammar(text): |
| | blob = TextBlob(text) |
| | corrected_text = str(blob.correct()) |
| | return corrected_text != text |
| |
|
| | def extract_dates(text): |
| | date_patterns = [ |
| | r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', |
| | r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b', |
| | r'\b\d{1,2}(?:st|nd|rd|th)?\s+\w+\s*,?\s*\d{2,4}\b', |
| | r'\b\w+\s+\d{1,2},\s*\d{4}\b', |
| | ] |
| | dates_found = [] |
| | for pattern in date_patterns: |
| | matches = re.findall(pattern, text, flags=re.IGNORECASE) |
| | dates_found.extend(matches) |
| | return list(set(dates_found)) |
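

# For example (illustrative; output order varies because of the set):
#   extract_dates("Circular dated 01/02/2024; exam on March 10, 2024")
#   -> ['01/02/2024', 'March 10, 2024']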


def classify_dates(text, dates):
    issue_keywords = ["issued on", "dated", "notified on", "circular no"]
    event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]

    issue_dates, event_dates = [], []
    for d in dates:
        idx = text.lower().find(d.lower())
        if idx != -1:
            # Classify by keywords in a ~60-character window around the date.
            context = text[max(0, idx - 60): idx + 60].lower()
            if any(k in context for k in issue_keywords):
                issue_dates.append(d)
            elif any(k in context for k in event_keywords):
                # Keep the rest of the line so the report shows the event phrasing.
                after_text = text[idx: idx + 80]
                match = re.search(rf"{re.escape(d)}[^\n]*", after_text)
                event_dates.append(match.group().strip() if match else d)

    # If nothing matched an issue keyword, assume the first date is the issue date.
    if not issue_dates and dates:
        issue_dates.append(dates[0])
    return issue_dates, event_dates
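

# Illustrative call; the keyword windows drive the split:
#   notice = ("Circular No. 5, dated 01/02/2024, announces the annual schedule "
#             "for all departments. The exam will be held on 10/03/2024.")
#   classify_dates(notice, ["01/02/2024", "10/03/2024"])
#   -> (['01/02/2024'], ['10/03/2024.'])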


def verify_text(text, source_type="TEXT"):
    if not text.strip():
        return "--- Evidence Report ---\n\n❌ No readable text provided."

    # Rule-based evidence signals.
    grammar_issue = check_grammar(text)
    dates = extract_dates(text)
    issue_dates, event_dates = classify_dates(text, dates)

    scam_keywords = [
        "bank details", "send money", "lottery", "win prize",
        "transfer fee", "urgent", "click here", "claim", "scholarship $"
    ]
    scam_detected = any(kw in text.lower() for kw in scam_keywords)

    # Flag a contradiction when the first event date parses to before the issue date.
    contradiction = False
    if issue_dates and event_dates:
        fmt_variants = ["%d/%m/%Y", "%d-%m-%Y", "%d.%m.%Y", "%d %B %Y", "%B %d, %Y"]

        def parse_date(d):
            for fmt in fmt_variants:
                try:
                    return datetime.strptime(d, fmt)
                except ValueError:
                    continue
            return None

        parsed_issue = parse_date(issue_dates[0])
        parsed_event = parse_date(event_dates[0])
        if parsed_issue and parsed_event and parsed_event < parsed_issue:
            contradiction = True

    # Zero-shot model verdict on the first 1000 characters.
    labels = ["REAL", "FAKE"]
    result = classifier(text[:1000], candidate_labels=labels)
    model_label = result['labels'][0]
    model_confidence = result['scores'][0]

    # Any rule-based red flag overrides the model verdict.
    final_label = model_label
    if scam_detected or contradiction or grammar_issue:
        final_label = "FAKE"

    report = "📄 Evidence Report\n\n"
    report += "🔍 Document Analysis\n\n"
    report += f"Source: {source_type}\n\n"

    report += "✅ Evidence Considered\n\n"
    if grammar_issue:
        report += "⚠️ Grammar/Spelling issues detected.\n"
    else:
        report += "No grammar issues detected.\n"

    if issue_dates:
        report += f"📅 Issue Date(s): {', '.join(issue_dates)}\n"
    if event_dates:
        report += f"📅 Event Date(s): {', '.join(event_dates)}\n"
    if not dates:
        report += "No specific dates detected.\n"

    if contradiction:
        report += "⚠️ Date inconsistency detected (event before issue date).\n"
    if scam_detected:
        report += "⚠️ Scam-related keywords detected.\n"

    report += "\nFormatting and tone analyzed.\n\n"
    report += "🔍 Classification Result\n\n"
    report += f"Model Verdict: {model_label} ({model_confidence:.2f})\n"
    report += f"Final Verdict: {final_label}\n"

    return report
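

# Typical use (illustrative; the model verdict depends on the downloaded
# checkpoint, but the scam keywords alone force FAKE here):
#   print(verify_text("URGENT! Send money to claim your lottery prize."))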


def verify_document(file):
    if file is None:
        return "❌ Please upload a file or provide a file path."

    # Accept either a filesystem path or a Streamlit UploadedFile object.
    if isinstance(file, str):
        file_path = file
    else:
        # Spool the upload to a temp file so the extractors can open it by path.
        suffix = os.path.splitext(file.name)[-1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(file.read())
            file_path = tmp.name

    ext = file_path.split('.')[-1].lower()
    if ext == "pdf":
        text = extract_text_from_pdf(file_path)
    elif ext == "docx":
        text = extract_text_from_docx(file_path)
    elif ext in ["png", "jpg", "jpeg"]:
        text = extract_text_from_image(file_path)
    else:
        return "❌ Unsupported file type."

    return verify_text(text, source_type=ext.upper())
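

# Plain paths work too (assuming such a file exists locally):
#   print(verify_document("notice.pdf"))
# Note: temp files written for uploads are not deleted here; a production app
# would os.unlink(file_path) once extraction is done.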


def process_input(file, manual_text):
    # An uploaded file takes priority over pasted text.
    if file is not None:
        return verify_document(file)
    elif manual_text.strip():
        return verify_text(manual_text, source_type="MANUAL TEXT")
    else:
        return "❌ Please upload a document or paste text first."


# --- Streamlit UI ---
st.set_page_config(page_title="Document Verifier", layout="centered")
st.title("📄 Document Authenticity Verifier")

uploaded_file = st.file_uploader(
    "Upload a document (PDF, DOCX, PNG, JPG)",
    type=["pdf", "docx", "png", "jpg", "jpeg"]
)
manual_text = st.text_area("Or paste text manually")

if st.button("Verify Uploaded Document"):
    with st.spinner("Analyzing uploaded document..."):
        result = process_input(uploaded_file, "")
    st.text_area("Evidence Report", value=result, height=400)

if st.button("Verify Manual Text"):
    with st.spinner("Analyzing manual text..."):
        result = process_input(None, manual_text)
    st.text_area("Evidence Report", value=result, height=400)
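
# To run the app (assuming this file is saved as app.py):
#   streamlit run app.py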