Spaces:

tacab
/

ASR2025

Sleeping

App Files Files Community

ASR2025 / main.py

nurfarah57

Update main.py

6136d9d verified about 2 months ago

raw

history blame contribute delete

1.87 kB

	import os
	import re
	import tempfile
	import traceback

	import torchaudio
	import uvicorn
	from fastapi import FastAPI, UploadFile, File
	from fastapi.responses import JSONResponse
	from transformers import pipeline

	# Ask torchaudio to use the soundfile backend. The global backend switch is
	# deprecated in newer torchaudio releases, so only call it when the
	# installed version still exposes it; otherwise an AttributeError here
	# would stop the app from starting.
	if hasattr(torchaudio, "set_audio_backend"):
	    torchaudio.set_audio_backend("soundfile")

	# ASGI application object; the route decorator below registers on it.
	app = FastAPI(title="Tacab ASR Somali API")

	# The same hub id supplies both the acoustic model and its tokenizer.
	_ASR_MODEL_ID = "tacab/ASR_SOMALI"

	# Speech-recognition pipeline, built once at import time and reused by
	# every request.
	asr = pipeline(
	    task="automatic-speech-recognition",
	    model=_ASR_MODEL_ID,
	    tokenizer=_ASR_MODEL_ID,
	    # Audio is processed in 30 s windows with a 6 s stride between them.
	    chunk_length_s=30,
	    stride_length_s=6,
	    return_timestamps="word",
	    # -1 selects the CPU in the transformers pipeline API.
	    device=-1,
	)

	def auto_punctuate(text: str, segment_size: int = 10) -> str:
	    """Add basic sentence punctuation and capitalisation to raw ASR text.

	    The text is cut into runs of ``segment_size`` words. Each run is closed
	    with a period unless it already ends in ``.``, ``?`` or ``!``, so no
	    terminator is ever doubled. Every resulting sentence (including ones
	    delimited by punctuation already present in the input) is then
	    capitalised.

	    Args:
	        text: Raw transcription; surrounding/duplicate whitespace is ignored.
	        segment_size: Number of words per generated sentence (default 10).

	    Returns:
	        The punctuated text, or ``""`` when ``text`` contains no words.

	    Raises:
	        ValueError: If ``segment_size`` is smaller than 1.
	    """
	    if segment_size < 1:
	        raise ValueError("segment_size must be at least 1")

	    words = text.split()
	    segments = []
	    for start in range(0, len(words), segment_size):
	        segment = " ".join(words[start:start + segment_size])
	        # Only terminate segments that are not already terminated; blindly
	        # appending a period is what produced ".." and "?." in the output.
	        if not segment.endswith((".", "?", "!")):
	            segment += "."
	        segments.append(segment)

	    # Re-split on existing sentence ends so punctuation that was already in
	    # the input also starts a new capitalised sentence.
	    sentences = re.split(r'(?<=[.?!])\s+', " ".join(segments))

	    # str.capitalize() upper-cases the first character and lower-cases the
	    # rest, matching the casing behaviour this function has always had.
	    return " ".join(s.capitalize() for s in sentences if s)

	@app.post("/transcribe", tags=["ASR"], operation_id="generate")
	async def transcribe(file: UploadFile = File(...)):
	    """Transcribe an uploaded audio file and return punctuated text.

	    The upload is written to a uniquely named temporary file, passed to the
	    ``asr`` pipeline, and the recognised text is run through
	    ``auto_punctuate``. The temporary file is removed before returning.

	    Args:
	        file: The uploaded audio file (multipart form field ``file``).

	    Returns:
	        ``{"text": <punctuated transcription>}`` on success; a JSON
	        ``{"error": ...}`` body with status 400 when the pipeline yields no
	        text, or status 500 when any exception is raised.
	    """
	    temp_path = None
	    try:
	        # The client-supplied filename is untrusted: reuse only its
	        # extension (so the audio loader can still detect the format) and
	        # let tempfile choose a unique, traversal-safe path.
	        suffix = os.path.splitext(os.path.basename(file.filename or ""))[1]
	        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
	            # Record the path first so cleanup runs even if the write fails.
	            temp_path = tmp.name
	            tmp.write(await file.read())

	        # NOTE(review): this is a synchronous call inside an async handler,
	        # so other requests wait while it runs — confirm that is acceptable.
	        result = asr(temp_path)
	        raw_text = result.get("text", "").strip()
	        if not raw_text:
	            return JSONResponse({"error": "No transcription result."}, status_code=400)

	        return {"text": auto_punctuate(raw_text)}

	    except Exception as e:
	        traceback.print_exc()
	        return JSONResponse({"error": str(e)}, status_code=500)

	    finally:
	        if temp_path is not None:
	            try:
	                os.remove(temp_path)
	            except OSError:
	                # Best-effort cleanup; a failed delete must not replace the
	                # response that is already being returned.
	                pass

	# Script entry point: serve the app directly when this file is executed
	# rather than imported.
	if __name__ == "__main__":
	    uvicorn.run(
	        app,
	        host="0.0.0.0",  # listen on all network interfaces
	        port=7860,
	    )