# mediastorm/tests/test_parser.py
# remdms's picture
# feat: add parse_recognition, parse_press_mentions, extract_director, extract_commissioned_by parsers
# b8835a7
from mediastorm.ingest.parser import (
strip_html,
parse_transcript,
extract_embed_codes,
parse_credits,
extract_poster_images,
parse_recognition,
parse_press_mentions,
extract_director,
extract_commissioned_by,
)
def test_strip_html_removes_tags():
    """Nested paragraph/strong markup is reduced to its plain text."""
    markup = "<p>Hello <strong>world</strong>.</p>"
    stripped = strip_html(markup)
    assert stripped == "Hello world."
def test_strip_html_handles_empty():
    """Both the empty string and ``None`` yield an empty string."""
    for blank in ("", None):
        assert strip_html(blank) == ""
def test_strip_html_preserves_whitespace():
    """The text of each adjacent paragraph survives stripping."""
    stripped = strip_html("<p>First paragraph.</p><p>Second paragraph.</p>")
    for sentence in ("First paragraph.", "Second paragraph."):
        assert sentence in stripped
def test_parse_transcript_extracts_speakers(sample_transcript_html):
    """The first two turns carry the expected speaker names and text."""
    parsed = parse_transcript(sample_transcript_html)
    assert len(parsed) >= 2

    opening = parsed[0]
    assert opening.speaker == "Phillip Toledano"
    assert "photographing my father" in opening.text

    reply = parsed[1]
    assert reply.speaker == "Brian Storm"
    assert "documentary stories" in reply.text
def test_parse_transcript_handles_narration(sample_transcript_html):
    """Turns with no speaker (``speaker is None``) are still returned."""
    unattributed = []
    for turn in parse_transcript(sample_transcript_html):
        if turn.speaker is None:
            unattributed.append(turn)

    assert len(unattributed) >= 1
    first_text = unattributed[0].text
    assert "narration" in first_text.lower()
def test_parse_transcript_handles_empty():
    """An empty string or ``None`` produces an empty list of turns."""
    for blank in ("", None):
        assert parse_transcript(blank) == []
def test_extract_embed_codes_from_structured_content(sample_structured_content):
    """Exactly the two embed codes present in the fixture are returned."""
    found = extract_embed_codes(sample_structured_content)
    for code in ("832l", "abc1"):
        assert code in found
    assert len(found) == 2
def test_extract_embed_codes_handles_no_embeds():
    """A lone text block yields no embed codes."""
    text_block = {"block_type": "text", "content": "<p>Just text</p>"}
    found = extract_embed_codes([text_block])
    assert found == []
def test_parse_credits():
    """Two raw credit dicts become two parsed entries, the first keeping its name and role."""
    director = {"name": "John Doe", "role": "Director", "type": "individual"}
    producer = {"name": "MediaStorm", "role": "Producer", "type": "organization"}

    parsed = parse_credits([director, producer])

    assert len(parsed) == 2
    lead = parsed[0]
    assert lead.name == "John Doe"
    assert lead.role == "Director"
def test_extract_poster_images_returns_all_sizes(sample_page_data):
    """The fixture's preview images produce the 600/900/1500 URLs in that order."""
    preview_images = sample_page_data["preview_images"]
    expected = [
        "/media/abc123_600.jpg",
        "/media/abc123_900.jpg",
        "/media/abc123_1500.jpg",
    ]
    assert extract_poster_images(preview_images) == expected
def test_extract_poster_images_handles_missing_poster_frame():
    """Preview data containing only a ``square`` entry yields no URLs."""
    square_only = {"square": [{"url": "/media/x.jpg"}]}
    assert extract_poster_images(square_only) == []
def test_extract_poster_images_handles_none():
    """``None`` preview data yields an empty list."""
    urls = extract_poster_images(None)
    assert urls == []
def test_extract_poster_images_handles_empty_dict():
    """An empty preview dict yields an empty list."""
    urls = extract_poster_images({})
    assert urls == []
def test_extract_poster_images_handles_empty_poster_frame_list():
    """A ``poster-frame`` key holding an empty list yields an empty list."""
    no_frames = {"poster-frame": []}
    assert extract_poster_images(no_frames) == []
def test_extract_poster_images_sorts_by_width():
    """Frames supplied widest-first come back narrowest-first."""
    wide = {"width": 1500, "height": 844, "url": "/media/h_1500.jpg"}
    narrow = {"width": 600, "height": 337, "url": "/media/h_600.jpg"}

    # Deliberately pass the wider frame first so ordering is exercised.
    result = extract_poster_images({"poster-frame": [wide, narrow]})

    assert result == ["/media/h_600.jpg", "/media/h_1500.jpg"]
# --- parse_recognition ---
def test_parse_recognition_format1_structured():
    """Format 1: a bold festival name followed by Year:/Place:/Category: lines."""
    heading_block = {"block_type": "heading", "content": "Recognition"}
    awards_html = (
        "<p><strong>NPPA's Best of Photojournalism</strong><br>"
        "Year: 2013<br>Place: First<br>"
        "Category: Multimedia Documentary</p>"
        "<p><strong>World Press Photo</strong><br>"
        "Year: 2014<br>Place: Second<br>"
        "Category: Multimedia</p>"
    )
    text_block = {"block_type": "text", "content": awards_html}

    parsed = parse_recognition([heading_block, text_block])

    assert len(parsed) == 2

    # Every field is checked on the first award; only the festival on the second.
    first_expected = (
        ("festival", "NPPA's Best of Photojournalism"),
        ("year", "2013"),
        ("place", "First"),
        ("category", "Multimedia Documentary"),
    )
    for field, value in first_expected:
        assert parsed[0][field] == value
    assert parsed[1]["festival"] == "World Press Photo"
def test_parse_recognition_format2_inline():
    """Format 2: a bold ``Year:`` prefix, then linked festival, place, category."""
    heading_block = {"block_type": "heading", "content": "Recognition"}
    awards_html = (
        '<p><strong>2016:</strong> <a href="#">Pictures of the Year International</a>,'
        " First Place, Multimedia Photographer of the Year</p>"
        '<p><strong>2015:</strong> <a href="#">Emmy Award</a>,'
        " Winner, New Approaches to Documentary</p>"
    )
    text_block = {"block_type": "text", "content": awards_html}

    parsed = parse_recognition([heading_block, text_block])

    assert len(parsed) == 2

    # (festival, year, place) expected at each position, in document order.
    expected_rows = (
        ("Pictures of the Year International", "2016", "First Place"),
        ("Emmy Award", "2015", "Winner"),
    )
    for position, (festival, year, place) in enumerate(expected_rows):
        award = parsed[position]
        assert award["festival"] == festival
        assert award["year"] == year
        assert award["place"] == place
def test_parse_recognition_no_recognition_heading():
    """Content whose only heading is "Introduction" yields no awards."""
    heading_block = {"block_type": "heading", "content": "Introduction"}
    text_block = {"block_type": "text", "content": "<p>Just text.</p>"}
    parsed = parse_recognition([heading_block, text_block])
    assert parsed == []
def test_parse_recognition_empty():
    """``None`` and an empty list both produce an empty result."""
    for nothing in (None, []):
        assert parse_recognition(nothing) == []
# --- parse_press_mentions ---
def test_parse_press_mentions_extracts_links():
    """Links under a "Press" heading become name/url entries in document order."""
    heading_block = {"block_type": "heading", "content": "Press"}
    links_html = (
        '<p><a href="https://nytimes.com/article">The New York Times</a></p>'
        '<p><a href="https://washpost.com/story">The Washington Post</a></p>'
    )
    text_block = {"block_type": "text", "content": links_html}

    found = parse_press_mentions([heading_block, text_block])

    assert len(found) == 2
    first, second = found[0], found[1]
    assert first["name"] == "The New York Times"
    assert first["url"] == "https://nytimes.com/article"
    assert second["name"] == "The Washington Post"
def test_parse_press_mentions_no_press_heading():
    """Content whose only heading is "Credits" yields no press mentions."""
    heading_block = {"block_type": "heading", "content": "Credits"}
    text_block = {"block_type": "text", "content": "<p>Some text.</p>"}
    found = parse_press_mentions([heading_block, text_block])
    assert found == []
def test_parse_press_mentions_empty():
    """``None`` and an empty list both produce an empty result."""
    for nothing in (None, []):
        assert parse_press_mentions(nothing) == []
# --- extract_director ---
def test_extract_director_finds_director_role():
    """The person whose role is "Director & Editor" is returned as director."""
    photographer = {"name": "Alice Smith", "role": "Photography", "type": "individual"}
    director = {"name": "Tim McLaughlin", "role": "Director & Editor", "type": "individual"}
    producer = {"name": "MediaStorm", "role": "Producer", "type": "organization"}

    found = extract_director([photographer, director, producer])

    assert found == "Tim McLaughlin"
def test_extract_director_case_insensitive():
    """A lowercase "director" role is still recognised."""
    lowercase_entry = {"name": "Jane Doe", "role": "director", "type": "individual"}
    found = extract_director([lowercase_entry])
    assert found == "Jane Doe"
def test_extract_director_no_director():
    """With only a "Photography" credit the result is an empty string."""
    photographer = {"name": "Alice", "role": "Photography", "type": "individual"}
    found = extract_director([photographer])
    assert found == ""
def test_extract_director_empty():
    """``None`` and an empty list both produce an empty string."""
    for nothing in (None, []):
        assert extract_director(nothing) == ""
# --- extract_commissioned_by ---
def test_extract_commissioned_by_from_credits():
    """A credit with role "Commissioned By" supplies the commissioner name."""
    commissioner = {
        "name": "Yale Environment 360",
        "role": "Commissioned By",
        "type": "organization",
    }
    found = extract_commissioned_by([commissioner], "Some Story")
    assert found == "Yale Environment 360"
def test_extract_commissioned_by_from_name_pattern():
    """With no commissioning credit, the text after "for" in the title is returned."""
    director_only = [{"name": "Alice", "role": "Director", "type": "individual"}]
    title = "Leveling Appalachia for Yale Environment 360"
    assert extract_commissioned_by(director_only, title) == "Yale Environment 360"
def test_extract_commissioned_by_credits_takes_priority():
    """A commissioning credit wins over a "for ..." pattern in the title."""
    commissioner = {"name": "UNICEF", "role": "Commissioned by", "type": "organization"}
    found = extract_commissioned_by([commissioner], "Story for Some Client")
    assert found == "UNICEF"
def test_extract_commissioned_by_none():
    """No commissioning credit and a plain title give an empty string."""
    director_only = [{"name": "Alice", "role": "Director", "type": "individual"}]
    found = extract_commissioned_by(director_only, "Simple Title")
    assert found == ""
def test_extract_commissioned_by_empty():
    """``None`` or empty credits with an empty title give an empty string."""
    for nothing in (None, []):
        assert extract_commissioned_by(nothing, "") == ""