Created
June 13, 2022 20:23
-
-
Save pikami/1bc4835288c857fe73552711506377c8 to your computer and use it in GitHub Desktop.
From blog post https://pikami.org/blog/scraping-torrent-sites
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"context" | |
"encoding/json" | |
"fmt" | |
"io/ioutil" | |
"log" | |
"net/http" | |
"strconv" | |
"strings" | |
"time" | |
"github.com/PuerkitoBio/goquery" | |
elasticsearch "github.com/elastic/go-elasticsearch" | |
"github.com/elastic/go-elasticsearch/esapi" | |
"golang.org/x/net/html" | |
) | |
// file is one entry of a torrent's file listing.
type file struct {
	// Path holds the directory components leading to the file, with the
	// file name itself as the last element ([]interface{} so it marshals
	// as a plain JSON array of strings).
	Path []interface{} `json:"path"`
	// Length is the file size in bytes (see humanReadableToBytes).
	Length int `json:"length"`
}
// bitTorrent is the document shape pushed into the Elasticsearch index;
// the JSON tags define the index field names.
type bitTorrent struct {
	InfoHash    string `json:"infohash"`    // hex info hash, also used as the document id
	Name        string `json:"name"`        // torrent title
	Description string `json:"description"` // free-form description from the torrent page
	Files       []file `json:"files,omitempty"`
	Length      int    `json:"length,omitempty"`          // total size in bytes
	DateLastIndexed string `json:"dateLastIndexed,omitempty"` // upload timestamp, ISO-8601
	Source      string `json:"source"` // scraper identifier (SOURCE_NAME)
}
// Scraper identity and target site.
const SOURCE_NAME = "lili-nyaa"
const BASE_URL = "https://nyaa.si/view/" // torrent detail page, id appended

// CSS selectors for fields on a torrent detail page.
const SELECTOR_NAME = ".panel:nth-child(1) .panel-title"
const SELECTOR_HASH = ".panel:nth-child(1) kbd"
const SELECTOR_DESCRIPTION = "#torrent-description"
const SELECTOR_UPLOAD_DATE = ".panel:nth-child(1) div[data-timestamp]"
const SELECTOR_FILES = ".torrent-file-list > ul"
const SELECTOR_SIZE = ".panel:nth-child(1) div.row:nth-child(4) > div:nth-child(2)"

// Listing page used only to discover the most recent torrent id.
const TORRENT_LIST_URL = "https://nyaa.si"
const SELECTOR_FIRST_TORRENT_LINK = ".torrent-list td:nth-child(2) > a"
func main() { | |
// Elastic search connection | |
cfg := elasticsearch.Config{ | |
Transport: &http.Transport{ | |
ResponseHeaderTimeout: 5 * time.Second, | |
}, | |
Addresses: []string{ | |
"http://10.8.0.6:9200", | |
}, | |
} | |
es, err := elasticsearch.NewClient(cfg) | |
if err != nil { | |
panic(err) | |
} | |
// Most recent torrent | |
newestId, resCode := getMostRecentTorrentId() | |
if resCode != 200 { | |
log.Fatalf("Unexpected response code: %d", resCode) | |
} | |
log.Printf("Most recent torrent id: %d", newestId) | |
// Iterate trough all torrents | |
var i int64 | |
for i = 0; i < newestId; i++ { | |
// Added a delay cause I don't want to cause a DOS attack lol | |
time.Sleep(time.Second) | |
torrent, resCode := grabTorrent(i) | |
if resCode == 200 { | |
pushToIndex(es, torrent) | |
} else if resCode != 404 { | |
log.Fatalf("Unexpected response code: %d", resCode) | |
} | |
} | |
} | |
// Pushes torrents to ES index | |
func pushToIndex(es *elasticsearch.Client, torrent bitTorrent) { | |
data, err := json.Marshal(torrent) | |
if err == nil { | |
body := fmt.Sprintf("%s\n\n", data) | |
req := esapi.IndexRequest{ | |
Index: "nyaa_scrape", | |
DocumentType: "torrent", | |
DocumentID: string(torrent.InfoHash), | |
Body: strings.NewReader(body), | |
Refresh: "true", | |
} | |
res, err := req.Do(context.Background(), es) | |
if err != nil { | |
log.Fatalf("Error getting response: %s", err) | |
} | |
res.Body.Close() | |
} | |
} | |
// Gets the id of the most recent torrent | |
func getMostRecentTorrentId() (int64, int) { | |
// Get listing page html | |
htmlStr, resCode := getHtml(TORRENT_LIST_URL) | |
if resCode != 200 { | |
return 0, resCode | |
} | |
rootNode, err := html.Parse(strings.NewReader(htmlStr)) | |
if err != nil { | |
log.Fatalln(err) | |
} | |
doc := goquery.NewDocumentFromNode(rootNode) | |
// Grab first torrent link and extract it's id | |
torrentUrl := doc.Find(SELECTOR_FIRST_TORRENT_LINK).First().AttrOr("href", "") | |
urlParts := strings.Split(torrentUrl, "/") | |
idStr := urlParts[len(urlParts)-1] | |
idInt, err := strconv.ParseInt(idStr, 10, 64) | |
if err != nil { | |
log.Fatalln("Failed to grab most recent torrent id.") | |
panic(err) | |
} | |
return idInt, resCode | |
} | |
// Recursive function to parse the file tree | |
// I'm using `ChildrenFiltered` here to get direct childred | |
// of our list element to prevent it from searching all the DOM | |
// path - array of directories to reach current path | |
// s - selection <li> element | |
func getFiles(path []interface{}, s *goquery.Selection) []file { | |
result := make([]file, 0) | |
folder := s.ChildrenFiltered(".folder").Text() | |
if folder != "" { | |
path = append(path, folder) | |
aa := s.ChildrenFiltered("ul").ChildrenFiltered("li") | |
fmt.Println(aa.Text()) | |
s.ChildrenFiltered("ul").ChildrenFiltered("li"). | |
Each(func(i int, s *goquery.Selection) { | |
result = append(result, getFiles(path, s)...) | |
}) | |
return result | |
} | |
fileTitle := "" | |
s.Contents().Each(func(i int, s *goquery.Selection) { | |
if goquery.NodeName(s) == "#text" { | |
fileTitle = s.Text() | |
} | |
}) | |
sizeNode := s.Find(".file-size").Text() | |
currentFile := file{ | |
Path: append(path, fileTitle), | |
Length: humanReadableToBytes(sizeNode), | |
} | |
return append(result, currentFile) | |
} | |
// humanReadableToBytes converts a human-readable file size such as
// "686.2 MiB" (optionally wrapped in parentheses) to a byte count.
// Malformed input yields 0; a recognized number with an unknown unit
// yields the bare number.
func humanReadableToBytes(str string) int {
	parts := strings.Split(
		strings.NewReplacer("(", "", ")", "").Replace(str), " ")
	if len(parts) != 2 {
		return 0
	}
	// Parse at full float64 precision; 32-bit rounding loses digits on
	// large sizes.
	size, err := strconv.ParseFloat(parts[0], 64)
	if err != nil {
		return 0
	}
	switch parts[1] {
	case "KiB":
		return int(size * 1024)
	case "MiB":
		// 1 MiB = 1024 * 1024 = 1048576; the original multiplied by the
		// bogus constant 987654321.
		return int(size * 1048576)
	case "GiB":
		return int(size * 1073741824)
	case "TiB":
		return int(size * 1099511627776)
	}
	return int(size)
}
// Grabs torrent from nyaa by it's id | |
func grabTorrent(id int64) (bitTorrent, int) { | |
htmlStr, resCode := getHtml(fmt.Sprintf("%s%d", BASE_URL, id)) | |
if resCode != 200 { | |
return bitTorrent{}, resCode | |
} | |
rootNode, err := html.Parse(strings.NewReader(htmlStr)) | |
if err != nil { | |
log.Fatalln(err) | |
} | |
doc := goquery.NewDocumentFromNode(rootNode) | |
// Extract text from simple elements | |
name := doc.Find(SELECTOR_NAME).First().Text() | |
hash := doc.Find(SELECTOR_HASH).First().Text() | |
description := doc.Find(SELECTOR_DESCRIPTION).First().Text() | |
size := doc.Find(SELECTOR_SIZE).First().Text() | |
// Extract timestamp attribute for upload date | |
uploadDate := doc.Find(SELECTOR_UPLOAD_DATE).First(). | |
AttrOr("data-timestamp", "0") | |
fileListElement := doc.Find(SELECTOR_FILES).First() | |
fileList := make([]file, 0) | |
path := make([]interface{}, 0) | |
fileListElement.ChildrenFiltered("ul > li"). | |
Each(func(i int, s *goquery.Selection) { | |
fileList = append(fileList, getFiles(path, s)...) | |
}) | |
fmt.Println(cleanString(name)) | |
fmt.Println(cleanString(hash)) | |
fmt.Println(description) | |
fmt.Println(cleanString(size)) | |
fmt.Println(uploadDate) | |
fmt.Println(fileListElement.Text()) | |
return bitTorrent{ | |
InfoHash: cleanString(hash), | |
Name: cleanString(name), | |
Description: description, | |
Files: fileList, | |
Length: humanReadableToBytes(size), | |
DateLastIndexed: unixToUTC(uploadDate), | |
Source: SOURCE_NAME, | |
}, resCode | |
} | |
// cleanString returns str with every newline and tab character removed.
func cleanString(str string) string {
	var b strings.Builder
	b.Grow(len(str))
	for _, r := range str {
		if r != '\n' && r != '\t' {
			b.WriteRune(r)
		}
	}
	return b.String()
}
// unixToUTC converts a unix timestamp string to an ISO-8601 (RFC 3339)
// timestamp in UTC. Unparseable input falls back to the current time.
func unixToUTC(unixTimeStamp string) string {
	res := time.Now()
	unixIntValue, err := strconv.ParseInt(unixTimeStamp, 10, 64)
	if err == nil {
		res = time.Unix(unixIntValue, 0)
	}
	// Go layouts use the reference time 2006-01-02T15:04:05; the
	// original passed a literal 2022 date, so every torrent got the
	// same fixed string. Render in UTC, as the function name promises.
	return res.UTC().Format("2006-01-02T15:04:05Z")
}
// getHtml performs a GET request to url and returns the response body
// along with the HTTP status code. Transport or read errors are fatal.
func getHtml(url string) (string, int) {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatalln(err)
	}
	// Close the body so the transport can reuse the connection; the
	// original leaked it on every request.
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Fatalln(err)
	}
	return string(body), resp.StatusCode
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment