Last active
September 18, 2024 11:04
-
-
Save JustSimplyKyle/5ad650d87c2e82a6ba3ece2fbf440f05 to your computer and use it in GitHub Desktop.
A scraper for `manhuagui.com` that can also convert the downloaded chapters to EPUB.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::io::Write; | |
use std::process::Command; | |
use std::time::Duration; | |
use argh::FromArgs; | |
use bytes::BytesMut; | |
use color_eyre::eyre::{bail, ContextCompat}; | |
use fantoccini::{ClientBuilder, Locator}; | |
use reqwest::header::{ | |
HeaderMap, HeaderValue, ACCEPT, ACCEPT_ENCODING, ACCEPT_LANGUAGE, CONNECTION, REFERER, | |
USER_AGENT, | |
}; | |
use reqwest::Client; | |
use tokio::fs::create_dir_all; | |
use tokio_stream::StreamExt; | |
#[derive(FromArgs, Clone)] | |
// "https://www.manhuagui.com/comic/33860/725114.html"; | |
/// Web scraper to download manhuagui things | |
struct Arguments { | |
/// if you don't want to download, only wanting to translate already downloaded files | |
#[argh(option)] | |
only_translate_to_epub: Option<String>, | |
/// url | |
#[argh(option, short = 'i')] | |
url: Option<String>, | |
/// to launch the chrome webdriver headlessly or not | |
#[argh(switch)] | |
headless: bool, | |
/// port | |
#[argh(option)] | |
port: Option<i64>, | |
} | |
#[tokio::main] | |
async fn main() -> color_eyre::Result<()> { | |
color_eyre::install()?; | |
let args: Arguments = argh::from_env(); | |
spawn_chromedriver(&args)?; | |
if let Some(x) = args.only_translate_to_epub { | |
to_epub(&x).await?; | |
} else { | |
if let Some(url) = args.url.clone() { | |
let anime = download(&args, &url).await?; | |
to_epub(&anime).await?; | |
} else { | |
bail!("You HAVE to provide url"); | |
} | |
} | |
Ok(()) | |
} | |
fn spawn_chromedriver(args: &Arguments) -> color_eyre::Result<()> { | |
let headless = if args.headless { "--headless" } else { "" }; | |
let port = if let Some(port) = args.port { | |
format!("--port={}", port.to_string()) | |
} else { | |
String::new() | |
}; | |
Command::new(format!("chromedriver")) | |
.arg(headless) | |
.arg(port) | |
.spawn()?; | |
Ok(()) | |
} | |
async fn to_epub(anime: &str) -> color_eyre::Result<()> { | |
let mut dir = tokio::fs::read_dir(&anime).await?; | |
while let Some(entry) = dir.next_entry().await? { | |
let chapter = entry | |
.path() | |
.to_string_lossy() | |
.to_string() | |
.chars() | |
.filter(|x| x.is_numeric()) | |
.collect::<String>() | |
.parse::<i64>()?; | |
let mut sub_path = tokio::fs::read_dir(entry.path()).await?; | |
let mut paths = Vec::new(); | |
while let Some(path) = sub_path.next_entry().await? { | |
if path.path().extension() == Some("jpg".as_ref()) { | |
paths.push(path.path()); | |
} | |
} | |
paths.sort_by_key(|x| { | |
x.file_name() | |
.unwrap() | |
.to_string_lossy() | |
.to_string() | |
.chars() | |
.filter(|x| x.is_numeric()) | |
.collect::<String>() | |
.parse::<i64>() | |
.unwrap() | |
}); | |
let mut pandoc = pandoc::new(); | |
let html_path = entry.path().join("html_pages"); | |
create_dir_all(&html_path).await?; | |
for (str, path) in paths.iter().map(|x| { | |
let name = x | |
.file_name() | |
.unwrap_or_default() | |
.to_string_lossy() | |
.to_string(); | |
( | |
format!( | |
"<img src='{}' style='object-fit: contain; height: 100%; width: auto'>", | |
x.display(), | |
), | |
html_path.join(name), | |
) | |
}) { | |
dbg!(&path); | |
let with_html_extension = path.with_extension("html"); | |
let mut file = std::fs::File::create(&with_html_extension)?; | |
pandoc.add_input(&with_html_extension); | |
write!(&mut file, "{str}")?; | |
} | |
let first_picture = paths[0].clone(); | |
pandoc.add_option(pandoc::PandocOption::EpubCoverImage(first_picture)); | |
pandoc.set_output(pandoc::OutputKind::File( | |
format!("{anime} 第 {chapter} 卷.epub").into(), | |
)); | |
pandoc.execute()?; | |
} | |
Ok(()) | |
} | |
async fn download(args: &Arguments, url: &str) -> color_eyre::Result<String> { | |
let mut caps = serde_json::map::Map::new(); | |
let default = vec!["--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage"]; | |
let opts = args.headless.then_some("--headless").map_or(default, |x| { | |
vec![ | |
x, | |
"--disable-gpu", | |
"--no-sandbox", | |
"--disable-dev-shm-usage", | |
] | |
}); | |
let opts = serde_json::json!({ | |
"args": opts, | |
}); | |
caps.insert("goog:chromeOptions".to_string(), opts); | |
let driver = ClientBuilder::native() | |
.capabilities(caps) | |
.connect(&format!("http://localhost:{0}", args.port.unwrap_or(0))) | |
.await?; | |
driver.goto(url).await?; | |
let client = build_client().await?; | |
let anime = driver | |
.find(Locator::Css(".title")) | |
.await? | |
.find(Locator::Css("h1")) | |
.await? | |
.find(Locator::Css("a")) | |
.await? | |
.text() | |
.await?; | |
tokio::fs::create_dir_all("anime").await?; | |
let mut tasks = Vec::new(); | |
for chpt in 1.. { | |
let last = driver.find(Locator::Css(".tip-alert")).await; | |
if last.is_ok() { | |
break; | |
} | |
tokio::fs::create_dir_all(format!("{anime}/Chapter {chpt}")).await?; | |
let page_count = get_page_count(&driver).await?; | |
for i in 1..=page_count { | |
let ele = driver.find(Locator::Id("mangaFile")).await?; | |
let image_url = ele.attr("src").await?.context("impossible")?; | |
let client_cloned = client.clone(); | |
let anime_cloned = anime.clone(); | |
let task = tokio::spawn(async move { | |
let bytes = access_image(&client_cloned, &image_url).await?; | |
tokio::fs::write(format!("{anime_cloned}/Chapter {chpt}/{i}.jpg"), bytes).await?; | |
Ok::<(), color_eyre::Report>(()) | |
}); | |
tasks.push(task); | |
let next = driver.find(Locator::Id("next")).await?; | |
next.click().await?; | |
} | |
if let Ok(x) = driver.find(Locator::Css(".pb-x")).await { | |
x.click().await?; | |
} | |
let chapter_btn = driver.find(Locator::Css(".nextC")).await?; | |
chapter_btn.click().await?; | |
while let Ok(_) = driver.find(Locator::Id("smh-msg-box")).await { | |
tokio::time::sleep(Duration::from_millis(200)).await; | |
} | |
} | |
for x in tasks { | |
x.await??; | |
} | |
Ok(anime) | |
} | |
async fn get_page_count(driver: &fantoccini::Client) -> color_eyre::Result<i64> { | |
let ele = driver.find(Locator::Css(".title")).await?; | |
let ele = ele.find_all(Locator::Css("span")).await?; | |
let stream = tokio_stream::iter(ele) | |
.then(|x| async move { x.text().await }) | |
.filter_map(|x| x.ok()) | |
.filter(|x| x.contains("(")); | |
let pages = Box::pin(stream).next().await.unwrap(); | |
let target = pages.split('/').last().unwrap(); | |
let target = target[..target.len() - 1].parse::<i64>().unwrap(); | |
Ok(target) | |
} | |
async fn build_client() -> color_eyre::Result<Client> { | |
let referrer_url = "https://www.manhuagui.com/comic/33860/459824.html"; | |
let mut headers = HeaderMap::new(); | |
headers.insert(USER_AGENT, HeaderValue::from_static("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")); | |
headers.insert(REFERER, HeaderValue::from_str(referrer_url)?); | |
headers.insert( | |
ACCEPT, | |
HeaderValue::from_static("image/webp,image/apng,image/*,*/*;q=0.8"), | |
); | |
headers.insert( | |
ACCEPT_ENCODING, | |
HeaderValue::from_static("gzip, deflate, br"), | |
); | |
headers.insert(ACCEPT_LANGUAGE, HeaderValue::from_static("en-US,en;q=0.9")); | |
headers.insert(CONNECTION, HeaderValue::from_static("keep-alive")); | |
headers.insert("Sec-Fetch-Dest", HeaderValue::from_static("image")); | |
headers.insert("Sec-Fetch-Mode", HeaderValue::from_static("no-cors")); | |
headers.insert("Sec-Fetch-Site", HeaderValue::from_static("cross-site")); | |
let client = reqwest::Client::builder() | |
.default_headers(headers) | |
.build()?; | |
client.get(referrer_url).send().await?; | |
Ok(client) | |
} | |
async fn access_image(client: &Client, image_url: &str) -> color_eyre::Result<bytes::Bytes> { | |
// Now try to access the image URL | |
let mut response = client.get(image_url).send().await?; | |
let status = response.status(); | |
println!("Successfully accessed the image. Status code: {}", status); | |
let all = response.content_length(); | |
let mut bytes = BytesMut::new(); | |
while let Some(chunk) = response.chunk().await? { | |
if let Some(x) = all { | |
println!("Percentage: {}", chunk.len() as f64 / x as f64); | |
} else { | |
println!("Downloaded Bytes: {}", bytes.len()); | |
} | |
bytes.extend(chunk); | |
} | |
let content = bytes.freeze(); | |
Ok(content) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment