#!/bin/bash
#
# scrape_video_titles.sh - collect video ID -> title mappings from saved HTML.
#
# Usage: scrape_video_titles.sh <dir>
#
# Every regular file directly inside <dir> is parsed as HTML. For each
# <span class="video-title title-truncate m-t-5"> element the span's text is
# taken as the title and the third '/'-separated field of the parent element's
# href attribute is taken as the ID. The resulting "<id>|<title>" lines are
# appended to id_title_map.txt in the current working directory.
#
# Requires xmllint, which can be installed with the "libxml2" (Fedora) or
# "libxml2-utils" (Debian) package.

# Deliberately no `set -e` / `pipefail`: xmllint reports an error for pages in
# which the XPath matches nothing, and such pages must simply be skipped.
set -u

readonly TITLE_XPATH='//span[@class="video-title title-truncate m-t-5"]'
readonly MAP_FILE='id_title_map.txt'

# Scratch directory; created in main() and removed by cleanup() on any exit.
tmp_dir=''

# Print the help text to stderr.
usage() {
  cat >&2 <<'EOF'
usage: scrape_video_titles.sh <dir>
<dir> is a directory filled with HTML files downloaded from a URL starting with 'https://www.zoox18.com/videos'
scrapes all video ID -> title mappings found in the HTML documents and stores them in an 'id_title_map.txt' file of the following format:
<id>|<title>
this can be easily parsed (e.g. with grep) so later you can bulk rename downloaded video files using their titles.
the id_title_map.txt file is always appended, never overwritten, is unsorted, and may contain duplicates.
EOF
}

# Print an error message to stderr and abort with status 1.
die() {
  printf 'scrape_video_titles.sh: %s\n' "$*" >&2
  exit 1
}

# Remove the scratch directory, if one was created.
cleanup() {
  if [[ -n "$tmp_dir" ]]; then
    rm -rf -- "$tmp_dir"
  fi
}

#######################################
# Scrape every regular file in the given directory.
# Globals:   TITLE_XPATH, MAP_FILE (read), tmp_dir (written)
# Arguments: $1 - directory containing the downloaded HTML files
# Outputs:   appends "<id>|<title>" lines to MAP_FILE; diagnostics on stderr
# Returns:   exits 1 on usage or setup errors
#######################################
main() {
  if (( $# != 1 )); then
    usage
    exit 1
  fi

  local dir=$1
  [[ -d "$dir" ]] || die "not a directory: $dir"

  # Stop a directory name starting with '-' from being read as an option.
  if [[ "$dir" == -* ]]; then
    dir="./$dir"
  fi

  # Stderr of xmllint is discarded below, so a missing binary would otherwise
  # go unnoticed and the script would silently produce nothing.
  command -v xmllint >/dev/null 2>&1 \
    || die 'xmllint not found (install "libxml2" or "libxml2-utils")'

  # Private scratch space instead of fixed names in the current directory,
  # so existing titles.txt / ids.txt files of the user are never touched.
  tmp_dir=$(mktemp -d) || die 'could not create a temporary directory'
  trap cleanup EXIT

  local titles="$tmp_dir/titles.txt"
  local ids="$tmp_dir/ids.txt"
  local file n_ids n_titles

  for file in "$dir"/*; do
    # Skips sub-directories and the literal pattern left by an empty <dir>.
    [[ -f "$file" ]] || continue

    # Parser warnings for real-world HTML and "no match" errors are expected
    # and therefore silenced. '|' is stripped from titles because it is the
    # field separator of the map file.
    xmllint --html --xpath "${TITLE_XPATH}/text()" "$file" 2>/dev/null \
      | sed 's/|//g' > "$titles"
    xmllint --html --xpath "${TITLE_XPATH}/../@href" "$file" 2>/dev/null \
      | cut -d'/' -f3 > "$ids"

    # paste pairs lines purely by position; flag files where that is unsafe.
    n_ids=$(wc -l < "$ids")
    n_titles=$(wc -l < "$titles")
    if (( n_ids != n_titles )); then
      printf 'warning: %s: %d ids but %d title lines; entries may be misaligned\n' \
        "$file" "$n_ids" "$n_titles" >&2
    fi

    paste -d'|' "$ids" "$titles" >> "$MAP_FILE"
  done
}

main "$@"