#!/bin/bash
# requires xmllint, which can be installed with the "libxml2" (Fedora) or "libxml2-utils" (Debian) package
# Exactly one argument (the directory of saved HTML pages) is required.
# Anything else prints the usage text to stderr and exits with status 1.
if (( $# != 1 )); then
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat >&2 <<'EOF'
usage: scrape_video_titles.sh <dir>
<dir> is a directory filled with HTML files downloaded from a URL starting with 'https://www.zoox18.com/videos'
scrapes all video ID -> title mappings found in the HTML documents and stores them in an 'id_title_map.txt' file of the following format:
<id>|<title>
this can be easily parsed (e.g. with grep) so later you can bulk rename downloaded video files using their titles.
the id_title_map.txt file is always appended, never overwritten, is unsorted, and may contain duplicates.
EOF
  exit 1
fi
# Directory containing the saved HTML pages (first and only argument).
dir=$1

# Fail loudly on bad preconditions. The xmllint calls below discard stderr,
# so without these checks a wrong path or a missing xmllint would just
# produce an empty result with exit status 0.
if [[ ! -d "$dir" ]]; then
  printf "error: '%s' is not a directory\n" "$dir" >&2
  exit 1
fi
if ! command -v xmllint >/dev/null 2>&1; then
  printf '%s\n' "error: xmllint not found (install libxml2 / libxml2-utils)" >&2
  exit 1
fi

# Keep the per-file scratch lists in a private temporary directory rather
# than as 'titles.txt' / 'ids.txt' in the current directory, where they
# would overwrite (and later delete) any user files with those names.
# The EXIT trap removes the scratch directory on every exit path.
tmpdir=$(mktemp -d) || {
  printf '%s\n' "error: could not create a temporary directory" >&2
  exit 1
}
trap 'rm -rf -- "$tmpdir"' EXIT
titles_file=$tmpdir/titles.txt
ids_file=$tmpdir/ids.txt

# XPath selecting the <span> elements that hold the video titles.
title_span='//span[@class="video-title title-truncate m-t-5"]'

for file in "$dir"/*; do
  # Skip sub-directories and the literal pattern left by an unmatched glob.
  [[ -f "$file" ]] || continue

  # Titles: text content of each title span. '|' is stripped from titles
  # because it is the field separator of the output file.
  # stderr is discarded, presumably to hide HTML parser warnings.
  xmllint --html --xpath "${title_span}/text()" "$file" 2>/dev/null \
    | sed 's/|//g' > "$titles_file"

  # IDs: the href attribute of each title span's parent element; the third
  # '/'-separated field of the printed attribute is taken as the video ID.
  xmllint --html --xpath "${title_span}/../@href" "$file" 2>/dev/null \
    | cut -d'/' -f3 > "$ids_file"

  # Join line N of the IDs with line N of the titles as "<id>|<title>" and
  # append to the map in the current directory.
  # NOTE(review): assumes xmllint prints one node per line - confirm for the
  # installed libxml2 version.
  paste -d'|' "$ids_file" "$titles_file" >> id_title_map.txt
done