#!/bin/bash

# requires xmllint, which can be installed with the "libxml2" (Fedora) or "libxml2-utils" (Debian) package

# Exactly one argument (the directory of downloaded HTML pages) is required;
# anything else prints the help text below and exits with status 1.
if [[ $# -ne 1 ]]; then
    # Quoted delimiter: the text is emitted literally, with no expansion.
    cat <<'USAGE'
usage: scrape_video_titles.sh <dir>
<dir> is a directory filled with HTML files downloaded from a URL starting with 'https://www.zoox18.com/videos'
scrapes all video ID -> title mappings found in the HTML documents and stores them in an 'id_title_map.txt' file of the following format:
<id>|<title>
this can be easily parsed (e.g. with grep) so later you can bulk rename downloaded video files using their titles.
the id_title_map.txt file is always appended, never overwritten, is unsorted, and may contain duplicates.
USAGE
    exit 1
fi

# Directory holding the downloaded HTML pages (the sole command-line argument).
dir=$1

# Refuse anything that is not a directory; otherwise the glob below would stay
# literal (or, for an empty argument, expand to the filesystem root).
if [[ ! -d "$dir" ]]; then
    printf "error: '%s' is not a directory\n" "$dir" >&2
    exit 1
fi

# A path beginning with '-' would be parsed by xmllint as an option.
if [[ "$dir" == -* ]]; then
    dir=./$dir
fi

# Scratch files live in a private temporary directory so that nothing in the
# current directory is overwritten or deleted, and so they can never be picked
# up as input when <dir> is the current directory.
tmpdir=$(mktemp -d) || {
    echo "error: could not create a temporary directory" >&2
    exit 1
}
# cleanup: runs on every exit path, including interruption
trap 'rm -rf -- "$tmpdir"' EXIT

titles=$tmpdir/titles.txt
ids=$tmpdir/ids.txt

# XPath selecting the <span> elements that carry the video titles.
title_span='//span[@class="video-title title-truncate m-t-5"]'

for file in "$dir"/*; do
    # Skip subdirectories and the literal pattern left by an unmatched glob.
    [[ -f "$file" ]] || continue

    # xmllint's stderr is discarded on purpose: it is noisy on real-world HTML
    # and also complains when a page contains no matching nodes.

    # Titles: '|' is stripped because it is the output field separator.
    xmllint --html --xpath "${title_span}/text()" "$file" 2>/dev/null \
        | sed 's/|//g' > "$titles"

    # IDs: the href attribute of each span's parent element; the third
    # '/'-separated field of the printed attribute is kept as the ID.
    xmllint --html --xpath "${title_span}/../@href" "$file" 2>/dev/null \
        | cut -d'/' -f3 > "$ids"

    # paste pairs the two files line by line, so unequal line counts mean the
    # rows written for this file are misaligned; warn instead of staying silent.
    if (( $(wc -l < "$ids") != $(wc -l < "$titles") )); then
        printf "warning: id/title count mismatch in '%s'\n" "$file" >&2
    fi

    # Append "<id>|<title>" rows; the map file is never truncated.
    paste -d'|' "$ids" "$titles" >> id_title_map.txt
done