Skip to content

Instantly share code, notes, and snippets.

@runo280
Created August 20, 2020 15:48
Show Gist options
  • Save runo280/750e10c5f474f6d972eef85df8196cc8 to your computer and use it in GitHub Desktop.
Save runo280/750e10c5f474f6d972eef85df8196cc8 to your computer and use it in GitHub Desktop.
c@ster
package io.github.runo280;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.net.URL;
/**
 * Prints one {@code youtube-dl} command line per lesson link found on a single
 * caster.io course page.
 *
 * <p>The page is fetched once; every {@code <a>} inside the first
 * {@code div[course-id]} element is treated as a lesson link.
 */
public class Main {
    /** Course page whose lesson links are turned into download commands. */
    private static final String COURSE_URL =
            "https://caster.io/courses/android-machine-learning-with-tensorflow-lite-and-tf-keras";

    /** Timeout handed to Jsoup when fetching the page, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 20 * 1000;

    /** Matches the " mm:ss ..." tail of a link's text so that it can be cut off the file name. */
    private static final String DURATION_TAIL_REGEX = "\\s\\d\\d\\:\\d\\d\\s.*$";

    /**
     * Fetches the course page and writes the download commands to standard output.
     *
     * @param args ignored
     * @throws IOException if the page cannot be fetched
     * @throws IllegalStateException if the page has no {@code div[course-id]} element
     */
    public static void main(String[] args) throws IOException {
        Document document = Jsoup.parse(new URL(COURSE_URL), TIMEOUT_MILLIS);
        Element container = document.selectFirst("div[course-id]");
        if (container == null) {
            // selectFirst returns null when nothing matches; fail with a clear message
            // instead of a bare NullPointerException further down.
            throw new IllegalStateException("No div[course-id] element found at " + COURSE_URL);
        }
        int index = 0;
        for (Element e : container.getElementsByTag("a")) {
            index++;
            // e is already the <a> element, so its own text is the link text.
            String fileName = index + "- " + e.text().replaceAll(DURATION_TAIL_REGEX, "") + ".mp4";
            String link = e.attr("href");
            System.out.printf("youtube-dl --config-location casterconf \"%s\" -o \"%s\"%n", link, fileName);
        }
    }
}
/*
* This Java source file was generated by the Gradle 'init' task.
*/
package io.runo280.casterdl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
/**
 * Generates one bash download script per course listed on the caster.io courses page.
 *
 * <p>For every course card on {@link #COURSES_URL} a file named {@code dl_<name>.sh} is
 * written to the working directory. Each script creates a directory for the course and
 * contains one {@code youtube-dl} invocation per lesson link.
 */
public class App {
    public static final String COURSES_URL = "https://caster.io/courses";
    static String BASH_HEADER = "#!/usr/bin/env bash";
    /* Earlier fixed option string, superseded by getArgs(); kept for reference. */
    /*static String OPTIONS = "--ignore-config -f hd_mp4_video-1 --cookies cookie -i -c --external-downloader aria2c " +
    "--external-downloader-args \"-c -s16 -k1M -x16 --enable-color=true --human-readable=true\" " +
    "--user-agent \"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0\"";*/

    /**
     * Walks the course listing and writes a download script for every course card.
     *
     * @param args ignored
     * @throws IOException if a page cannot be fetched or a script cannot be written
     */
    public static void main(String[] args) throws IOException {
        Document coursesPage = getPage(COURSES_URL);
        Elements courseList = coursesPage.select("a.cioc-cardgroup__item");
        for (Element c : courseList) {
            String link = "https://caster.io" + c.attr("href");
            System.out.println(link);
            Element title = c.selectFirst("span.cioc-link--yellow");
            if (title == null) {
                // selectFirst returns null when nothing matches; skip this card rather
                // than abort the whole run with a NullPointerException.
                System.err.println("Skipping course card without a title: " + link);
                continue;
            }
            String name = title.text().replaceAll(" ", "_");
            System.out.println(name + "\n\n");
            createDl(name, link);
        }
    }

    /**
     * Fetches and parses the page at {@code url} with a 20 second timeout.
     *
     * @param url absolute URL of the page
     * @return the parsed document
     * @throws IOException if the page cannot be fetched
     */
    private static Document getPage(String url) throws IOException {
        return Jsoup.parse(new URL(url), 20 * 1000);
    }

    /**
     * Writes the download script for one course.
     *
     * <p>The script file is opened in append mode, so running the tool again adds to an
     * existing script instead of replacing it.
     *
     * @param name course name, used for the script file name and the download directory
     * @param url absolute URL of the course page
     * @throws IOException if the page cannot be fetched or the script cannot be written
     */
    static void createDl(String name, String url) throws IOException {
        Document document = getPage(url);
        Element container = document.selectFirst("div[course-id]");
        if (container == null) {
            // Checked before the file is opened so no header-only script is left behind.
            System.err.println("No div[course-id] element found at " + url + "; no script written");
            return;
        }
        String downloaderArgs = getArgs(); // identical for every lesson, so build it once
        File file = new File("dl_" + FileUtils.sanitizeFilename(name) + ".sh");
        // try-with-resources closes the writer even when a write fails part-way.
        try (FileWriter fr = new FileWriter(file, true)) {
            fr.write(BASH_HEADER);
            // Quoted so shell metacharacters in the course name do not break the script;
            // -p keeps a second run from failing on an existing directory.
            fr.write("\nmkdir -p \"" + name + "\"\n");
            int index = 0;
            for (Element e : container.getElementsByTag("a")) {
                index++;
                // e is already the <a> element; the regex cuts the " mm:ss ..." tail off its text.
                String fileName = index + "- " + e.text().replaceAll("\\s\\d\\d\\:\\d\\d\\s.*$", "") + ".mp4";
                fileName = FileUtils.sanitizeFilename(fileName);
                String link = e.attr("href");
                // Forward slash: the script runs under bash, where a backslash inside double
                // quotes would become part of the file name instead of separating directories.
                fr.write(String.format("\nyoutube-dl %s \"%s\" -o \"%s/%s\"", downloaderArgs, link, name, fileName));
            }
        }
    }

    /**
     * Builds the option string shared by every generated {@code youtube-dl} command.
     *
     * @return the options separated by single spaces, with one trailing space
     */
    static String getArgs() {
        return String.join(" ",
                "--ignore-config",
                "--cookies cookie",
                "--external-downloader aria2c",
                "--user-agent \"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0\"",
                "-i -c",
                "-f \"(worst[width>=1080])[protocol^=http]/(worst[width>=720])[protocol^=http]\"",
                "--external-downloader-args \"-c -s16 -k1M -x16 --enable-color=true --human-readable=true\"")
                + " ";
    }
}
import re
import string
import requests
from bs4 import BeautifulSoup
'''
format code extension resolution note
mp4-224p mp4 400x224 182k , mp4 container, h264, 1.63MiB
iphone-360p mp4 640x360 227k , mp4 container, h264, 2.03MiB
md_mp4-540p mp4 960x540 295k , mp4 container, h264, 2.63MiB
hd_mp4-720p mp4 1280x720 361k , mp4 container, h264, 3.22MiB
hd_mp4-1080p mp4 1920x1080 526k , mp4 container, h264, 4.68MiB
original bin 1920x1080 2660k , 23.68MiB (best)
'''
def format_filename(s):
    """Take a string and return a valid filename constructed from the string.

    Uses a whitelist approach: any character that is not an ASCII letter, a
    digit, or one of ``-_.()`` and space is removed. Spaces are then replaced
    with underscores.

    Note: this function may produce invalid filenames such as ``''``, ``.`` or
    ``..``. Prepend a prefix (for example a date string like
    ``2009_01_15_19_46_32_``) and append a file extension like ``.txt`` to
    avoid ending up with such a name.

    :param s: the text to turn into a filename
    :return: ``s`` with disallowed characters dropped and spaces replaced by ``_``
    """
    # Built once as a set so each character is a constant-time lookup.
    valid_chars = set("-_.() " + string.ascii_letters + string.digits)
    filename = ''.join(c for c in s if c in valid_chars)
    return filename.replace(' ', '_')  # I don't like spaces in filenames.
# Print one youtube-dl command per lesson link of a single caster.io course page.
course_url = 'https://caster.io/courses/kotlin-programming-language'
video_format = 'hd_mp4-1080p'  # same for every lesson, so set once outside the loop

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops it from parsing an HTTP error page as if it were the course.
request = requests.get(course_url, timeout=20)
request.raise_for_status()
source = BeautifulSoup(request.text, 'html.parser')

# Every <div> that carries a course-id attribute, whatever its value.
course = source.find_all('div', {'course-id': re.compile(r'.*')})
index = 0
for r in course:
    lesson = r.find_all('a')
    print(len(lesson))
    for item in lesson:
        index += 1
        # Only the first line of the link text is used for the file name.
        title = item.text.strip().partition('\n')[0]
        stem = format_filename(title).replace('_-_', '_')
        # Drop a leading "<number>." only. The match is anchored and runs before the
        # extension is added, so a title ending in a digit keeps its ".mp4" suffix.
        stem = re.sub(r'^\d+\.', '', stem)
        file_name = f'{index:03d}{stem}.mp4'
        url = item['href']
        command = f'youtube-dl --ignore-config -f {video_format} -i -c --external-downloader aria2c ' \
                  f'--external-downloader-args "-c -s16 -k1M -x16 --enable-color=true --human-readable=true" ' \
                  f'--user-agent "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0" "{url}" ' \
                  f'-o "{file_name}" '
        print(command)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment