Created
April 29, 2020 03:43
-
-
Save cgarz/d79b27fcf14f74838cee55eb303542b6 to your computer and use it in GitHub Desktop.
YouTube Watch Later stats. Gets statistics and video links from your Watch Later playlist using Python 3, requests, BeautifulSoup, and browser_cookie3 for authentication.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# adapted from https://github.com/bulbipop/ytwlstats/blob/master/cmd.py | |
from bs4 import BeautifulSoup as bs | |
from datetime import datetime, timedelta | |
from argparse import ArgumentParser | |
from filecmp import cmp as compare | |
import requests | |
import browser_cookie3 | |
import sys | |
import os | |
# HTML parser handed to BeautifulSoup. 'html.parser' is the stdlib alternative
# and can be swapped in when lxml is not installed.
PARSER = 'lxml'

# Base URL for every request and every emitted video link. HTTPS is used so the
# browser session cookies attached to each request never travel in clear text.
YT_PREFIX = 'https://www.youtube.com'

# File (relative to the script directory) that receives one video link per line.
LINK_LIST_OUT = 'YT_WL_links.txt'

# --auto-name writes to "<AUTO_NAME_DIR>/<AUTO_NAME_PFX><timestamp>.txt".
AUTO_NAME_PFX = 'YT_WL_stats_'
AUTO_NAME_DIR = 'stats'
# https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
# Zero-padded, most-significant-first, so file names sort chronologically.
TIMESTAMP_FMT = '%Y-%m-%d_%H-%M-%S'

# Field labels for the per-video stats output; all have the same width so the
# values line up in a column.
S_INDEX = 'Index....: '
S_TITLE = 'Title....: '
S_LINK = 'Link.....: '
S_UPLOADER = 'Uploader.: '
S_DURATION = 'Duration.: '
# Placeholder written when a field could not be found in the page.
S_NO_DATA = 'N/A'
def req(url, cookie_jar, timeout=30):
    """Download ``url`` with the given cookies and return the response.

    Args:
        url: Absolute URL to fetch.
        cookie_jar: Cookie jar sent with the request (used for authentication).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The ``requests.Response`` for a successful request.

    Raises:
        requests.HTTPError: If the server answered with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    print('Downloading ' + url)
    response = requests.get(url, cookies=cookie_jar, timeout=timeout)
    # Fail loudly here instead of letting callers parse an error page.
    response.raise_for_status()
    return response
def calc_durations(length, speeds=(1, 1.5, 2, 3)):
    """Yield ``(duration, speed)`` pairs for ``length`` played at each speed.

    ``length`` is a ``timedelta``; each yielded duration is ``length / speed``
    with the sub-second part dropped.
    """
    for factor in speeds:
        scaled = length / factor
        # Rebuilding from days + seconds discards the microseconds component.
        yield timedelta(days=scaled.days, seconds=scaled.seconds), factor
def _parse_duration(text):
    """Convert an ``S``, ``M:SS`` or ``H:MM:SS`` string to a ``timedelta``.

    Returns ``None`` when ``text`` is not one to three colon-separated integers.
    """
    try:
        fields = [int(part) for part in text.split(':')]
    except ValueError:
        return None
    if not 1 <= len(fields) <= 3:
        return None
    # Left-pad with zeros so the fields are always (hours, minutes, seconds).
    hours, minutes, seconds = [0] * (3 - len(fields)) + fields
    return timedelta(hours=hours, minutes=minutes, seconds=seconds)


def _parse_video(video, index):
    """Extract the stats entry and duration from one ``pl-video`` element.

    Returns ``(entry, duration)``. ``entry`` is ``None`` when the element has no
    title link; ``duration`` is ``None`` when it is missing or unparseable.
    """
    link = video.find(class_='pl-video-title-link')
    if link is None:
        return None, None

    entry = {
        'index': str(index),
        'title': link.text.strip(),
        # Keep only the first query parameter of the link, dropping everything after the first '&'.
        'link': YT_PREFIX + link['href'].split('&')[0],
    }

    owner = video.find(class_='pl-video-owner')
    owner_link = owner.a if owner is not None else None
    entry['uploader'] = owner_link.text.strip() if owner_link is not None else S_NO_DATA

    timestamp = video.find(class_='timestamp')
    if timestamp is None:
        entry['duration'] = S_NO_DATA
        return entry, None

    duration_text = timestamp.text.strip()
    entry['duration'] = duration_text
    duration = _parse_duration(duration_text)
    if duration is None:
        print(f'WARNING: Cannot parse duration "{duration_text}" of video {index}. Not counted in total.')
    return entry, duration


def scrape_stats():
    """Scrape the Watch Later playlist of the Firefox-logged-in YouTube user.

    Downloads the playlist page, follows every "load more" continuation, and
    collects one entry per video.

    Returns:
        ``(vid_entries, total)`` where ``vid_entries`` is a list of dicts with
        the string keys ``index``, ``title``, ``link``, ``uploader`` and
        ``duration``, and ``total`` is the summed ``timedelta`` of all
        durations that could be parsed.
    """
    cj = browser_cookie3.firefox(domain_name='youtube.com')
    soup_main = bs(req(YT_PREFIX + '/playlist?list=WL', cj).text, PARSER)
    button = soup_main.find(class_='load-more-button')

    # Separate runs needed. The ajax fragments are collected on their own and
    # parsed as a second document: appending them to the first page's html
    # would apparently corrupt its structure, causing BeautifulSoup to miss the
    # pl-video elements that come after the load more button.
    extra_chunks = []
    while button is not None:
        ajax = req(YT_PREFIX + button['data-uix-load-more-href'], cj).json()
        extra_chunks.append(ajax['content_html'])
        button = bs(ajax['load_more_widget_html'], PARSER).button

    soups = [soup_main]
    if extra_chunks:
        soups.append(bs(''.join(extra_chunks), PARSER))

    vid_entries = []
    total = timedelta()
    count = 0
    for soup in soups:
        for video in soup.find_all(class_='pl-video'):
            # Counted before validation, so a skipped element still uses up its index.
            count += 1
            entry, duration = _parse_video(video, count)
            if entry is None:
                print('WARNING: No link with class "pl-video-title-link" in "pl-video". Skipping.')
                continue
            if duration is not None:
                total += duration
            vid_entries.append(entry)
    return vid_entries, total
def _write_stats(out, vid_entries, playtime_stats):
    """Write one labelled record per video, then the playtime summary, to ``out``."""
    for vid_entry in vid_entries:
        out.write(S_INDEX + vid_entry['index'] + '\n')
        out.write(S_TITLE + vid_entry['title'] + '\n')
        out.write(S_LINK + vid_entry['link'] + '\n')
        out.write(S_UPLOADER + vid_entry['uploader'] + '\n')
        out.write(S_DURATION + vid_entry['duration'] + '\n')
        out.write('\n')
    out.write('\n'.join(playtime_stats) + '\n')


def main():
    """Command-line entry point.

    Parses the arguments, scrapes the Watch Later playlist, writes all video
    links to ``LINK_LIST_OUT`` and the per-video stats to stdout or to the
    requested file. With ``--auto-name`` the stats go to a timestamped file in
    ``AUTO_NAME_DIR``, which is deleted again if it is identical to the most
    recent earlier auto-named file.

    All relative paths are resolved against the directory containing this
    script, because the working directory is changed to it first.
    """
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    parser = ArgumentParser(description='Scrapes various information from videos in a users YouTube watch later list.')
    parser.add_argument('-f', '--clobber', action='store_true',
                        help='Force overwriting (clobbering) of output files. Default is to abort if file exists')
    parser.add_argument('-o', '--out-file', nargs='?', type=str, default=None,
                        help='Output file to print all video stats to. Defaults to stdout.')
    parser.add_argument('-a', '--auto-name', action='store_true',
                        help=(f'Set output filename and path automatically with format: '
                              f'"[SCRIPT_DIR]/{AUTO_NAME_DIR}/{AUTO_NAME_PFX}{TIMESTAMP_FMT.replace("%", "%%")}.txt". '
                              'Will also remove output file if last auto named file is identical.'))
    args = parser.parse_args()

    auto_name = None
    if args.auto_name:
        if args.out_file:
            parser.error('Cannot specify output file with auto output file naming.')
        auto_name = os.path.join(AUTO_NAME_DIR, f'{AUTO_NAME_PFX}{datetime.now().strftime(TIMESTAMP_FMT)}.txt')
        print('Auto name mode. Setting output filename to:', auto_name)
        args.out_file = auto_name
        os.makedirs(AUTO_NAME_DIR, exist_ok=True)

    to_stdout = not args.out_file or args.out_file == '-'
    # Check for an existing file up front so we abort before doing any network
    # work, but only open (and thereby truncate) it once there is data to write.
    if not to_stdout and not args.clobber and os.path.isfile(args.out_file):
        parser.error('Output file exists. To force overwrite, specify --clobber (-f)')

    print('Beginning WL scrape.')
    vid_entries, total_playtime = scrape_stats()
    print()

    playtime_stats = [f'{len(vid_entries)} videos with a total duration of:']
    playtime_stats.extend(f'{play_len} at {speed}x' for play_len, speed in calc_durations(total_playtime))

    print('Writing video links to:', LINK_LIST_OUT)
    with open(LINK_LIST_OUT, 'w') as f:
        f.write('\n'.join(vid['link'] for vid in vid_entries) + '\n')
    print('Done.')
    print()

    if to_stdout:
        _write_stats(sys.stdout, vid_entries, playtime_stats)
    else:
        print('Writing videos stats to:', args.out_file)
        # Explicit UTF-8 so titles with non-ASCII characters can always be
        # written, whatever the platform's default encoding is.
        with open(args.out_file, 'w', encoding='utf-8') as out:
            _write_stats(out, vid_entries, playtime_stats)
        print('Done.')
        print()

    if auto_name is not None:
        current = os.path.basename(auto_name)
        # Timestamped names sort chronologically, so the last one is the newest.
        previous = sorted(name for name in os.listdir(AUTO_NAME_DIR)
                          if name.startswith(AUTO_NAME_PFX) and name.endswith('.txt') and name != current)
        # On the first auto-named run there is no earlier file to compare with.
        if previous:
            last_stat_file = os.path.join(AUTO_NAME_DIR, previous[-1])
            if compare(last_stat_file, auto_name, shallow=False):
                print('Auto named file identical to previous.', '\nPrevious:', last_stat_file)
                print('Removing:', auto_name)
                os.remove(auto_name)
                print()

    # When the stats went to a file, still show the headline figure on stdout.
    if not to_stdout:
        print(' '.join(playtime_stats[:2]))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment