Spaces:

remdms
/

mediastorm

Sleeping

App Files Files Community

mediastorm / tests /test_parser.py

remdms

feat: add parse_recognition, parse_press_mentions, extract_director, extract_commissioned_by parsers

b8835a7 about 1 month ago

raw

history blame contribute delete

8.85 kB

	from mediastorm.ingest.parser import (
	strip_html,
	parse_transcript,
	extract_embed_codes,
	parse_credits,
	extract_poster_images,
	parse_recognition,
	parse_press_mentions,
	extract_director,
	extract_commissioned_by,
	)


	def test_strip_html_removes_tags():
	    """Block and inline tags are dropped; only their text content remains."""
	    markup = "<p>Hello <strong>world</strong>.</p>"
	    stripped = strip_html(markup)
	    assert stripped == "Hello world."


	def test_strip_html_handles_empty():
	    """An empty string and ``None`` both come back as the empty string."""
	    for blank in ("", None):
	        assert strip_html(blank) == ""


	def test_strip_html_preserves_whitespace():
	    """The text of each adjacent paragraph is still present after stripping.

	    Only the presence of both sentences is checked; the separator placed
	    between them is not asserted.
	    """
	    text = strip_html("<p>First paragraph.</p><p>Second paragraph.</p>")
	    for sentence in ("First paragraph.", "Second paragraph."):
	        assert sentence in text


	def test_parse_transcript_extracts_speakers(sample_transcript_html):
	    """The first two parsed turns carry the expected speaker and text."""
	    turns = parse_transcript(sample_transcript_html)
	    assert len(turns) >= 2
	    opening, reply = turns[0], turns[1]
	    assert opening.speaker == "Phillip Toledano"
	    assert "photographing my father" in opening.text
	    assert reply.speaker == "Brian Storm"
	    assert "documentary stories" in reply.text


	def test_parse_transcript_handles_narration(sample_transcript_html):
	    """Turns without a speaker attribution are still returned by the parser."""
	    unattributed = []
	    for turn in parse_transcript(sample_transcript_html):
	        if turn.speaker is None:
	            unattributed.append(turn)
	    assert len(unattributed) >= 1
	    # The comparison is case-insensitive on the turn text.
	    assert "narration" in unattributed[0].text.lower()


	def test_parse_transcript_handles_empty():
	    """An empty string and ``None`` both parse to an empty list of turns."""
	    for blank in ("", None):
	        assert parse_transcript(blank) == []


	def test_extract_embed_codes_from_structured_content(sample_structured_content):
	    """Both embed codes from the fixture are found, and exactly two in total."""
	    found = extract_embed_codes(sample_structured_content)
	    for expected_code in ("832l", "abc1"):
	        assert expected_code in found
	    assert len(found) == 2


	def test_extract_embed_codes_handles_no_embeds():
	    """Content made up of a single text block produces no embed codes."""
	    text_only_block = {"block_type": "text", "content": "<p>Just text</p>"}
	    assert extract_embed_codes([text_only_block]) == []


	def test_parse_credits():
	    """Two raw credit dicts yield two parsed credits; the first keeps name and role."""
	    raw_entries = [
	        {"name": "John Doe", "role": "Director", "type": "individual"},
	        {"name": "MediaStorm", "role": "Producer", "type": "organization"},
	    ]
	    parsed = parse_credits(raw_entries)
	    assert len(parsed) == 2
	    lead = parsed[0]
	    assert lead.name == "John Doe"
	    assert lead.role == "Director"


	def test_extract_poster_images_returns_all_sizes(sample_page_data):
	    """Every poster-frame URL in the fixture is returned, in the expected order."""
	    expected_urls = [
	        "/media/abc123_600.jpg",
	        "/media/abc123_900.jpg",
	        "/media/abc123_1500.jpg",
	    ]
	    previews = sample_page_data["preview_images"]
	    assert extract_poster_images(previews) == expected_urls


	def test_extract_poster_images_handles_missing_poster_frame():
	    """A preview mapping with only a "square" entry yields no poster URLs."""
	    square_only = {"square": [{"url": "/media/x.jpg"}]}
	    assert extract_poster_images(square_only) == []


	def test_extract_poster_images_handles_none():
	    """Passing ``None`` instead of a preview mapping yields an empty list."""
	    urls = extract_poster_images(None)
	    assert urls == []


	def test_extract_poster_images_handles_empty_dict():
	    """An empty preview mapping yields an empty list."""
	    urls = extract_poster_images({})
	    assert urls == []


	def test_extract_poster_images_handles_empty_poster_frame_list():
	    """A "poster-frame" key holding an empty list yields an empty list."""
	    no_frames = {"poster-frame": []}
	    assert extract_poster_images(no_frames) == []


	def test_extract_poster_images_sorts_by_width():
	    """Frames supplied widest-first come back narrowest-first."""
	    wide_frame = {"width": 1500, "height": 844, "url": "/media/h_1500.jpg"}
	    narrow_frame = {"width": 600, "height": 337, "url": "/media/h_600.jpg"}
	    # The input order is deliberately descending by width.
	    result = extract_poster_images({"poster-frame": [wide_frame, narrow_frame]})
	    assert result == ["/media/h_600.jpg", "/media/h_1500.jpg"]


	# --- parse_recognition ---

	def test_parse_recognition_format1_structured():
	    """Format 1: a bold festival name followed by Year:/Place:/Category: lines.

	    Every field of the first award is checked; for the second award only
	    the festival name is checked.
	    """
	    recognition_html = (
	        "<p><strong>NPPA's Best of Photojournalism</strong><br>"
	        "Year: 2013<br>Place: First<br>"
	        "Category: Multimedia Documentary</p>"
	        "<p><strong>World Press Photo</strong><br>"
	        "Year: 2014<br>Place: Second<br>"
	        "Category: Multimedia</p>"
	    )
	    blocks = [
	        {"block_type": "heading", "content": "Recognition"},
	        {"block_type": "text", "content": recognition_html},
	    ]
	    awards = parse_recognition(blocks)
	    assert len(awards) == 2
	    first, second = awards[0], awards[1]
	    assert first["festival"] == "NPPA's Best of Photojournalism"
	    assert first["year"] == "2013"
	    assert first["place"] == "First"
	    assert first["category"] == "Multimedia Documentary"
	    assert second["festival"] == "World Press Photo"


	def test_parse_recognition_format2_inline():
	    """Format 2: a bold "Year:" prefix, then linked festival, place, category.

	    Festival, year and place are checked for both awards; the category is
	    not asserted here.
	    """
	    recognition_html = (
	        '<p><strong>2016:</strong> <a href="#">Pictures of the Year International</a>,'
	        " First Place, Multimedia Photographer of the Year</p>"
	        '<p><strong>2015:</strong> <a href="#">Emmy Award</a>,'
	        " Winner, New Approaches to Documentary</p>"
	    )
	    blocks = [
	        {"block_type": "heading", "content": "Recognition"},
	        {"block_type": "text", "content": recognition_html},
	    ]
	    awards = parse_recognition(blocks)
	    assert len(awards) == 2
	    newer, older = awards[0], awards[1]
	    assert newer["festival"] == "Pictures of the Year International"
	    assert newer["year"] == "2016"
	    assert newer["place"] == "First Place"
	    assert older["festival"] == "Emmy Award"
	    assert older["year"] == "2015"
	    assert older["place"] == "Winner"


	def test_parse_recognition_no_recognition_heading():
	    """Content whose only heading is "Introduction" yields no awards."""
	    heading = {"block_type": "heading", "content": "Introduction"}
	    body = {"block_type": "text", "content": "<p>Just text.</p>"}
	    awards = parse_recognition([heading, body])
	    assert awards == []


	def test_parse_recognition_empty():
	    """``None`` and an empty block list both yield an empty award list."""
	    for nothing in (None, []):
	        assert parse_recognition(nothing) == []


	# --- parse_press_mentions ---

	def test_parse_press_mentions_extracts_links():
	    """Links under a "Press" heading become mentions with a name and a URL."""
	    press_html = (
	        '<p><a href="https://nytimes.com/article">The New York Times</a></p>'
	        '<p><a href="https://washpost.com/story">The Washington Post</a></p>'
	    )
	    blocks = [
	        {"block_type": "heading", "content": "Press"},
	        {"block_type": "text", "content": press_html},
	    ]
	    mentions = parse_press_mentions(blocks)
	    assert len(mentions) == 2
	    times, post = mentions[0], mentions[1]
	    assert times["name"] == "The New York Times"
	    assert times["url"] == "https://nytimes.com/article"
	    assert post["name"] == "The Washington Post"


	def test_parse_press_mentions_no_press_heading():
	    """Content whose only heading is "Credits" yields no press mentions."""
	    heading = {"block_type": "heading", "content": "Credits"}
	    body = {"block_type": "text", "content": "<p>Some text.</p>"}
	    mentions = parse_press_mentions([heading, body])
	    assert mentions == []


	def test_parse_press_mentions_empty():
	    """``None`` and an empty block list both yield an empty mention list."""
	    for nothing in (None, []):
	        assert parse_press_mentions(nothing) == []


	# --- extract_director ---

	def test_extract_director_finds_director_role():
	    """The person whose role is "Director & Editor" is returned as director."""
	    photographer = {"name": "Alice Smith", "role": "Photography", "type": "individual"}
	    director = {"name": "Tim McLaughlin", "role": "Director & Editor", "type": "individual"}
	    producer = {"name": "MediaStorm", "role": "Producer", "type": "organization"}
	    # The directing credit sits in the middle of the list, not first.
	    found = extract_director([photographer, director, producer])
	    assert found == "Tim McLaughlin"


	def test_extract_director_case_insensitive():
	    """A lower-case "director" role is still recognised."""
	    lowercase_credit = {"name": "Jane Doe", "role": "director", "type": "individual"}
	    found = extract_director([lowercase_credit])
	    assert found == "Jane Doe"


	def test_extract_director_no_director():
	    """With only a "Photography" credit the result is the empty string."""
	    photographer = {"name": "Alice", "role": "Photography", "type": "individual"}
	    found = extract_director([photographer])
	    assert found == ""


	def test_extract_director_empty():
	    """``None`` and an empty credit list both yield the empty string."""
	    for nothing in (None, []):
	        assert extract_director(nothing) == ""


	# --- extract_commissioned_by ---

	def test_extract_commissioned_by_from_credits():
	    """A credit with the role "Commissioned By" supplies the commissioner name."""
	    commissioner = {
	        "name": "Yale Environment 360",
	        "role": "Commissioned By",
	        "type": "organization",
	    }
	    found = extract_commissioned_by([commissioner], "Some Story")
	    assert found == "Yale Environment 360"


	def test_extract_commissioned_by_from_name_pattern():
	    """With no commissioning credit, the name after "for" in the title is used."""
	    director_only = [{"name": "Alice", "role": "Director", "type": "individual"}]
	    title = "Leveling Appalachia for Yale Environment 360"
	    assert extract_commissioned_by(director_only, title) == "Yale Environment 360"


	def test_extract_commissioned_by_credits_takes_priority():
	    """A commissioning credit wins over a "for ..." pattern in the title."""
	    commissioner = {"name": "UNICEF", "role": "Commissioned by", "type": "organization"}
	    # The title alone would suggest a different commissioner.
	    found = extract_commissioned_by([commissioner], "Story for Some Client")
	    assert found == "UNICEF"


	def test_extract_commissioned_by_none():
	    """No commissioning credit and a plain title yield the empty string."""
	    director_only = [{"name": "Alice", "role": "Director", "type": "individual"}]
	    found = extract_commissioned_by(director_only, "Simple Title")
	    assert found == ""


	def test_extract_commissioned_by_empty():
	    """Missing or empty credits with an empty title yield the empty string."""
	    for nothing in (None, []):
	        assert extract_commissioned_by(nothing, "") == ""