Created
June 13, 2022 20:23
-
-
Save pikami/1bc4835288c857fe73552711506377c8 to your computer and use it in GitHub Desktop.
From blog post https://pikami.org/blog/scraping-torrent-sites
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"context" | |
"encoding/json" | |
"fmt" | |
"io/ioutil" | |
"log" | |
"net/http" | |
"strconv" | |
"strings" | |
"time" | |
"github.com/PuerkitoBio/goquery" | |
elasticsearch "github.com/elastic/go-elasticsearch" | |
"github.com/elastic/go-elasticsearch/esapi" | |
"golang.org/x/net/html" | |
) | |
// file is one entry of a torrent's file listing.
type file struct {
	// Path holds the directory components leading to the file, with the
	// file name itself as the last element ([]interface{} so it marshals
	// as a plain JSON array of strings).
	Path []interface{} `json:"path"`
	// Length is the file size in bytes (see humanReadableToBytes).
	Length int `json:"length"`
}
// bitTorrent is the document shape pushed into the Elasticsearch index;
// the JSON tags define the index field names.
type bitTorrent struct {
	InfoHash    string `json:"infohash"`    // hex info hash, also used as the document id
	Name        string `json:"name"`        // torrent title
	Description string `json:"description"` // free-form description from the torrent page
	Files       []file `json:"files,omitempty"`
	Length      int    `json:"length,omitempty"`          // total size in bytes
	DateLastIndexed string `json:"dateLastIndexed,omitempty"` // upload timestamp, ISO-8601
	Source      string `json:"source"` // scraper identifier (SOURCE_NAME)
}
// Scraper identity and target site.
const SOURCE_NAME = "lili-nyaa"
const BASE_URL = "https://nyaa.si/view/" // torrent detail page, id appended

// CSS selectors for fields on a torrent detail page.
const SELECTOR_NAME = ".panel:nth-child(1) .panel-title"
const SELECTOR_HASH = ".panel:nth-child(1) kbd"
const SELECTOR_DESCRIPTION = "#torrent-description"
const SELECTOR_UPLOAD_DATE = ".panel:nth-child(1) div[data-timestamp]"
const SELECTOR_FILES = ".torrent-file-list > ul"
const SELECTOR_SIZE = ".panel:nth-child(1) div.row:nth-child(4) > div:nth-child(2)"

// Listing page used only to discover the most recent torrent id.
const TORRENT_LIST_URL = "https://nyaa.si"
const SELECTOR_FIRST_TORRENT_LINK = ".torrent-list td:nth-child(2) > a"
func main() { | |
// Elastic search connection | |
cfg := elasticsearch.Config{ | |
Transport: &http.Transport{ | |
ResponseHeaderTimeout: 5 * time.Second, | |
}, | |
Addresses: []string{ | |
"http://10.8.0.6:9200", | |
}, | |
} | |
es, err := elasticsearch.NewClient(cfg) | |
if err != nil { | |
panic(err) | |
} | |
// Most recent torrent | |
newestId, resCode := getMostRecentTorrentId() | |
if resCode != 200 { | |
log.Fatalf("Unexpected response code: %d", resCode) | |
} | |
log.Printf("Most recent torrent id: %d", newestId) | |
// Iterate trough all torrents | |
var i int64 | |
for i = 0; i < newestId; i++ { | |
// Added a delay cause I don't want to cause a DOS attack lol | |
time.Sleep(time.Second) | |
torrent, resCode := grabTorrent(i) | |
if resCode == 200 { | |
pushToIndex(es, torrent) | |
} else if resCode != 404 { | |
log.Fatalf("Unexpected response code: %d", resCode) | |
} | |
} | |
} | |
// Pushes torrents to ES index | |
func pushToIndex(es *elasticsearch.Client, torrent bitTorrent) { | |
data, err := json.Marshal(torrent) | |
if err == nil { | |
body := fmt.Sprintf("%s\n\n", data) | |
req := esapi.IndexRequest{ | |
Index: "nyaa_scrape", | |
DocumentType: "torrent", | |
DocumentID: string(torrent.InfoHash), | |
Body: strings.NewReader(body), | |
Refresh: "true", | |
} | |
res, err := req.Do(context.Background(), es) | |
if err != nil { | |
log.Fatalf("Error getting response: %s", err) | |
} | |
res.Body.Close() | |
} | |
} | |
// Gets the id of the most recent torrent | |
func getMostRecentTorrentId() (int64, int) { | |
// Get listing page html | |
htmlStr, resCode := getHtml(TORRENT_LIST_URL) | |
if resCode != 200 { | |
return 0, resCode | |
} | |
rootNode, err := html.Parse(strings.NewReader(htmlStr)) | |
if err != nil { | |
log.Fatalln(err) | |
} | |
doc := goquery.NewDocumentFromNode(rootNode) | |
// Grab first torrent link and extract it's id | |
torrentUrl := doc.Find(SELECTOR_FIRST_TORRENT_LINK).First().AttrOr("href", "") | |
urlParts := strings.Split(torrentUrl, "/") | |
idStr := urlParts[len(urlParts)-1] | |
idInt, err := strconv.ParseInt(idStr, 10, 64) | |
if err != nil { | |
log.Fatalln("Failed to grab most recent torrent id.") | |
panic(err) | |
} | |
return idInt, resCode | |
} | |
// Recursive function to parse the file tree | |
// I'm using `ChildrenFiltered` here to get direct childred | |
// of our list element to prevent it from searching all the DOM | |
// path - array of directories to reach current path | |
// s - selection <li> element | |
func getFiles(path []interface{}, s *goquery.Selection) []file { | |
result := make([]file, 0) | |
folder := s.ChildrenFiltered(".folder").Text() | |
if folder != "" { | |
path = append(path, folder) | |
aa := s.ChildrenFiltered("ul").ChildrenFiltered("li") | |
fmt.Println(aa.Text()) | |
s.ChildrenFiltered("ul").ChildrenFiltered("li"). | |
Each(func(i int, s *goquery.Selection) { | |
result = append(result, getFiles(path, s)...) | |
}) | |
return result | |
} | |
fileTitle := "" | |
s.Contents().Each(func(i int, s *goquery.Selection) { | |
if goquery.NodeName(s) == "#text" { | |
fileTitle = s.Text() | |
} | |
}) | |
sizeNode := s.Find(".file-size").Text() | |
currentFile := file{ | |
Path: append(path, fileTitle), | |
Length: humanReadableToBytes(sizeNode), | |
} | |
return append(result, currentFile) | |
} | |
// humanReadableToBytes converts a human-readable file size such as
// "686.2 MiB" (optionally wrapped in parentheses) to a byte count.
// Malformed input yields 0; a recognized number with an unknown unit
// yields the bare number.
func humanReadableToBytes(str string) int {
	parts := strings.Split(
		strings.NewReplacer("(", "", ")", "").Replace(str), " ")
	if len(parts) != 2 {
		return 0
	}
	// Parse at full float64 precision; 32-bit rounding loses digits on
	// large sizes.
	size, err := strconv.ParseFloat(parts[0], 64)
	if err != nil {
		return 0
	}
	switch parts[1] {
	case "KiB":
		return int(size * 1024)
	case "MiB":
		// 1 MiB = 1024 * 1024 = 1048576; the original multiplied by the
		// bogus constant 987654321.
		return int(size * 1048576)
	case "GiB":
		return int(size * 1073741824)
	case "TiB":
		return int(size * 1099511627776)
	}
	return int(size)
}
// Grabs torrent from nyaa by it's id | |
func grabTorrent(id int64) (bitTorrent, int) { | |
htmlStr, resCode := getHtml(fmt.Sprintf("%s%d", BASE_URL, id)) | |
if resCode != 200 { | |
return bitTorrent{}, resCode | |
} | |
rootNode, err := html.Parse(strings.NewReader(htmlStr)) | |
if err != nil { | |
log.Fatalln(err) | |
} | |
doc := goquery.NewDocumentFromNode(rootNode) | |
// Extract text from simple elements | |
name := doc.Find(SELECTOR_NAME).First().Text() | |
hash := doc.Find(SELECTOR_HASH).First().Text() | |
description := doc.Find(SELECTOR_DESCRIPTION).First().Text() | |
size := doc.Find(SELECTOR_SIZE).First().Text() | |
// Extract timestamp attribute for upload date | |
uploadDate := doc.Find(SELECTOR_UPLOAD_DATE).First(). | |
AttrOr("data-timestamp", "0") | |
fileListElement := doc.Find(SELECTOR_FILES).First() | |
fileList := make([]file, 0) | |
path := make([]interface{}, 0) | |
fileListElement.ChildrenFiltered("ul > li"). | |
Each(func(i int, s *goquery.Selection) { | |
fileList = append(fileList, getFiles(path, s)...) | |
}) | |
fmt.Println(cleanString(name)) | |
fmt.Println(cleanString(hash)) | |
fmt.Println(description) | |
fmt.Println(cleanString(size)) | |
fmt.Println(uploadDate) | |
fmt.Println(fileListElement.Text()) | |
return bitTorrent{ | |
InfoHash: cleanString(hash), | |
Name: cleanString(name), | |
Description: description, | |
Files: fileList, | |
Length: humanReadableToBytes(size), | |
DateLastIndexed: unixToUTC(uploadDate), | |
Source: SOURCE_NAME, | |
}, resCode | |
} | |
// cleanString returns str with every newline and tab character removed.
func cleanString(str string) string {
	var b strings.Builder
	b.Grow(len(str))
	for _, r := range str {
		if r != '\n' && r != '\t' {
			b.WriteRune(r)
		}
	}
	return b.String()
}
// unixToUTC converts a unix timestamp string to an ISO-8601 (RFC 3339)
// timestamp in UTC. Unparseable input falls back to the current time.
func unixToUTC(unixTimeStamp string) string {
	res := time.Now()
	unixIntValue, err := strconv.ParseInt(unixTimeStamp, 10, 64)
	if err == nil {
		res = time.Unix(unixIntValue, 0)
	}
	// Go layouts use the reference time 2006-01-02T15:04:05; the
	// original passed a literal 2022 date, so every torrent got the
	// same fixed string. Render in UTC, as the function name promises.
	return res.UTC().Format("2006-01-02T15:04:05Z")
}
// getHtml performs a GET request to url and returns the response body
// along with the HTTP status code. Transport or read errors are fatal.
func getHtml(url string) (string, int) {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatalln(err)
	}
	// Close the body so the transport can reuse the connection; the
	// original leaked it on every request.
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Fatalln(err)
	}
	return string(body), resp.StatusCode
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment