fsouza · August 29, 2015 13:57
diff --git a/feed_plantao_empresas.go b/feed_plantao_empresas.go
 package main

 import (
 	"flag"
 	"fmt"
 	"github.com/globocom/tsuru/db/storage"
 	"github.com/gorilla/feeds"
 	"labix.org/v2/mgo"
 	"labix.org/v2/mgo/bson"
 	"net/http"
 	"regexp"
 	"time"
 )

 const (
 	NewsURL = "http://www.bmfbovespa.com.br/agencia/corpo.asp?origem=exibir&id=%s"
 	Limit   = 100
 )

 var (
 	listenHTTP string
 	regexpNews = regexp.MustCompile(`^/bovespa/(\d+)$`)
 )

 func init() {
 	flag.StringVar(&listenHTTP, "listen", "127.0.0.1:7676", "address to listen to connections")
 	flag.Parse()
 }

 type News struct {
 	ID    string `bson:"_id"`
 	Title string
 	Date  time.Time
 }

 func (n *News) URL() string {
 	return fmt.Sprintf(NewsURL, n.ID)
 }

 func collection() (*storage.Collection, error) {
 	storage, err := storage.Open("localhost:27017", "bovespa_plantao_empresas")
 	if err != nil {
 		return nil, err
 	}
 	coll := storage.Collection("news")
 	coll.EnsureIndex(mgo.Index{Key: []string{"title"}, Background: true, Sparse: true})
 	coll.EnsureIndex(mgo.Index{Key: []string{"-date"}, Background: true, Sparse: true})
 	coll.EnsureIndex(mgo.Index{Key: []string{"title", "-date"}, Background: true, Sparse: true})
 	return coll, nil
 }

 func getFeed(query bson.M, id string) (*feeds.Feed, error) {
 	coll, err := collection()
 	if err != nil {
 		return nil, err
 	}
 	defer coll.Close()
 	var newsList []News
 	err = coll.Find(query).Sort("-date").Limit(Limit).All(&newsList)
 	if err != nil {
 		return nil, err
 	}
 	location, _ := time.LoadLocation("America/Sao_Paulo")
 	updated := time.Now()
 	if len(newsList) > 0 {
 		updated = newsList[0].Date.In(location)
 	}
 	feed := &feeds.Feed{
 		Title:       "Bovespa - Plantão Empresas",
 		Link:        &feeds.Link{Href: "http://www.bmfbovespa.com.br/Agencia-Noticias/ListarNoticias.aspx?w=" + id},
 		Description: "Notícias sobre empresas listadas na Bovespa",
 		Author:      &feeds.Author{Name: "Francisco Souza", Email: "f@souza.cc"},
 		Created:     time.Date(2014, 3, 20, 10, 0, 0, 0, location),
 		Updated:     updated,
 	}
 	for _, news := range newsList {
 		item := feeds.Item{
 			Id:          "http://plantao.souza.cc/bovespa/" + news.ID,
 			Title:       news.Title,
 			Link:        &feeds.Link{Href: "http://plantao.souza.cc/bovespa/" + news.ID},
 			Description: news.Title,
 			Author:      &feeds.Author{Name: "Bovespa", Email: "bovespa@bmfbovespa.com.br"},
 			Created:     news.Date,
 			Updated:     news.Date,
 		}
 		feed.Items = append(feed.Items, &item)
 	}
 	return feed, nil
 }

 func feedAll(w http.ResponseWriter, r *http.Request) {
 	feed, err := getFeed(bson.M{"title": bson.M{"$regex": "^((?!fii))", "$options": "i"}}, "all")
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 	}
 	atom, err := feed.ToAtom()
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
 	}
 	w.Header().Add("Content-Type", "application/xml")
 	fmt.Fprint(w, atom)
 }

 func feedFIIs(w http.ResponseWriter, r *http.Request) {
 	feed, err := getFeed(bson.M{"title": bson.M{"$regex": "^fii", "$options": "i"}}, "fii")
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
 	}
 	atom, err := feed.ToAtom()
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
 	}
 	w.Header().Add("Content-Type", "application/xml")
 	fmt.Fprint(w, atom)
 }

 func redirectNews(w http.ResponseWriter, r *http.Request) {
 	var newsID string
 	var news News
 	parts := regexpNews.FindStringSubmatch(r.URL.Path)
 	if len(parts) > 1 {
 		newsID = parts[1]
 	} else {
 		http.Error(w, "Page not found", http.StatusNotFound)
 		return
 	}
 	coll, err := collection()
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
 	}
 	defer coll.Close()
 	err = coll.FindId(newsID).One(&news)
 	if err == mgo.ErrNotFound {
 		http.Error(w, "News not found", http.StatusNotFound)
 		return
 	} else if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
 	}
 	w.Header().Add("Location", news.URL())
 	w.WriteHeader(http.StatusMovedPermanently)
 }

 func main() {
 	http.Handle("/all.atom", http.HandlerFunc(feedAll))
 	http.Handle("/fii.atom", http.HandlerFunc(feedFIIs))
 	http.Handle("/", http.HandlerFunc(redirectNews))
 	http.ListenAndServe(listenHTTP, nil)
 }
diff --git a/plantao_empresas.go b/plantao_empresas.go
 package main

 import (
 	"bytes"
 	"flag"
 	"fmt"
 	"github.com/globocom/tsuru/db/storage"
 	"io/ioutil"
 	"labix.org/v2/mgo"
 	"launchpad.net/xmlpath"
 	"log"
 	"net/http"
 	"regexp"
 	"strings"
 	"time"
 )

 const BaseURL = "http://www.bmfbovespa.com.br/Agencia-Noticias/ListarNoticias.aspx?idioma=pt-br&q=&tipoFiltro=%d&pg=%d"

 var (
 	pathLink              = xmlpath.MustCompile(`//ul[@id="linksNoticias"]/li/a`)
 	pathHrefLink          = xmlpath.MustCompile("./@href")
 	idRegexp              = regexp.MustCompile(`^ListarNoticias.aspx\?idioma=pt-br\&idNoticia=(\d+)\&.*$`)
 	replaceLessThan       = []byte{' ', '<', ' '}
 	replaceGreaterThan    = []byte{' ', '>', ' '}
 	replaceLessOrEqual    = []byte{' ', '<', '=', ' '}
 	replaceGreaterOrEqual = []byte{' ', '>', '=', ' '}
 	tickerTimer           time.Duration
 	filter                int
 )

 func init() {
 	flag.DurationVar(&tickerTimer, "interval", 10*time.Minute, "Ticker interval")
 	flag.IntVar(&filter, "filter", 0, "News filter (0 for daily, 1 for weekly)")
 	flag.Parse()
 }

 type News struct {
 	ID    string `bson:"_id"`
 	Title string
 	Date  time.Time
 }

 func collection() (*storage.Collection, error) {
 	storage, err := storage.Open("localhost:27017", "bovespa_plantao_empresas")
 	if err != nil {
 		return nil, err
 	}
 	coll := storage.Collection("news")
 	coll.EnsureIndex(mgo.Index{Key: []string{"title"}, Background: true, Sparse: true})
 	coll.EnsureIndex(mgo.Index{Key: []string{"-date"}, Background: true, Sparse: true})
 	coll.EnsureIndex(mgo.Index{Key: []string{"title", "-date"}, Background: true, Sparse: true})
 	return coll, nil
 }

 func downloadContent(page int) (*xmlpath.Node, error) {
 	url := fmt.Sprintf(BaseURL, filter, page)
 	resp, err := http.Get(url)
 	if err != nil {
 		return nil, err
 	}
 	defer resp.Body.Close()
 	content, err := ioutil.ReadAll(resp.Body)
 	if err != nil {
 		return nil, err
 	}
 	content = bytes.Replace(content, replaceLessThan, nil, -1)
 	content = bytes.Replace(content, replaceGreaterThan, nil, -1)
 	content = bytes.Replace(content, replaceLessOrEqual, nil, -1)
 	content = bytes.Replace(content, replaceGreaterOrEqual, nil, -1)
 	node, err := xmlpath.ParseHTML(bytes.NewBuffer(content))
 	if err != nil {
 		return nil, err
 	}
 	return node, err
 }

 func saveNews(news []News) {
 	coll, err := collection()
 	if err != nil {
 		log.Printf("[ERROR] Failed to save news: %s", err)
 		return
 	}
 	defer coll.Close()
 	for _, n := range news {
 		_, err = coll.UpsertId(n.ID, n)
 		if err != nil {
 			log.Printf("[ERROR] Failed to save news: %s", err)
 		}
 	}
 }

 func collectNews(node *xmlpath.Node) []News {
 	location, _ := time.LoadLocation("America/Sao_Paulo")
 	var err error
 	var newsList []News
 	iter := pathLink.Iter(node)
 	for iter.Next() {
 		var news News
 		target, ok := pathHrefLink.String(iter.Node())
 		if !ok {
 			continue
 		}
 		parts := idRegexp.FindStringSubmatch(target)
 		if len(parts) > 1 {
 			news.ID = parts[1]
 		}
 		content := iter.Node().String()
 		content = strings.TrimSpace(content)
 		parts = strings.SplitN(content, " - ", 2)
 		if len(parts) < 2 {
 			continue
 		}
 		news.Title = parts[1]
 		news.Date, err = time.ParseInLocation("02/01/2006 15:04", parts[0], location)
 		if err != nil {
 			log.Printf("[WARNING] Wrong date for news: %s", err)
 			continue
 		}
 		newsList = append(newsList, news)
 	}
 	return newsList
 }

 func run() {
 	for i := 0; ; i++ {
 		node, err := downloadContent(i + 1)
 		if err != nil {
 			log.Print(err)
 		}
 		if !pathLink.Exists(node) {
 			break
 		}
 		newsList := collectNews(node)
 		if len(newsList) > 0 {
 			saveNews(newsList)
 		}
 	}
 }

 func main() {
 	for _ = range time.Tick(tickerTimer) {
 		run()
 	}
 }
	package main

	import (
	"flag"
	"fmt"
	"github.com/globocom/tsuru/db/storage"
	"github.com/gorilla/feeds"
	"labix.org/v2/mgo"
	"labix.org/v2/mgo/bson"
	"net/http"
	"regexp"
	"time"
	)

	const (
	NewsURL = "http://www.bmfbovespa.com.br/agencia/corpo.asp?origem=exibir&id=%s"
	Limit = 100
	)

	var (
	listenHTTP string
	regexpNews = regexp.MustCompile(`^/bovespa/(\d+)$`)
	)

	func init() {
	flag.StringVar(&listenHTTP, "listen", "127.0.0.1:7676", "address to listen to connections")
	flag.Parse()
	}

	type News struct {
	ID string `bson:"_id"`
	Title string
	Date time.Time
	}

	func (n *News) URL() string {
	return fmt.Sprintf(NewsURL, n.ID)
	}

	func collection() (*storage.Collection, error) {
	storage, err := storage.Open("localhost:27017", "bovespa_plantao_empresas")
	if err != nil {
	return nil, err
	}
	coll := storage.Collection("news")
	coll.EnsureIndex(mgo.Index{Key: []string{"title"}, Background: true, Sparse: true})
	coll.EnsureIndex(mgo.Index{Key: []string{"-date"}, Background: true, Sparse: true})
	coll.EnsureIndex(mgo.Index{Key: []string{"title", "-date"}, Background: true, Sparse: true})
	return coll, nil
	}

	func getFeed(query bson.M, id string) (*feeds.Feed, error) {
	coll, err := collection()
	if err != nil {
	return nil, err
	}
	defer coll.Close()
	var newsList []News
	err = coll.Find(query).Sort("-date").Limit(Limit).All(&newsList)
	if err != nil {
	return nil, err
	}
	location, _ := time.LoadLocation("America/Sao_Paulo")
	updated := time.Now()
	if len(newsList) > 0 {
	updated = newsList[0].Date.In(location)
	}
	feed := &feeds.Feed{
	Title: "Bovespa - Plantão Empresas",
	Link: &feeds.Link{Href: "http://www.bmfbovespa.com.br/Agencia-Noticias/ListarNoticias.aspx?w=" + id},
	Description: "Notícias sobre empresas listadas na Bovespa",
	Author: &feeds.Author{Name: "Francisco Souza", Email: "f@souza.cc"},
	Created: time.Date(2014, 3, 20, 10, 0, 0, 0, location),
	Updated: updated,
	}
	for _, news := range newsList {
	item := feeds.Item{
	Id: "http://plantao.souza.cc/bovespa/" + news.ID,
	Title: news.Title,
	Link: &feeds.Link{Href: "http://plantao.souza.cc/bovespa/" + news.ID},
	Description: news.Title,
	Author: &feeds.Author{Name: "Bovespa", Email: "bovespa@bmfbovespa.com.br"},
	Created: news.Date,
	Updated: news.Date,
	}
	feed.Items = append(feed.Items, &item)
	}
	return feed, nil
	}

	func feedAll(w http.ResponseWriter, r *http.Request) {
	feed, err := getFeed(bson.M{"title": bson.M{"$regex": "^((?!fii))", "$options": "i"}}, "all")
	if err != nil {
	http.Error(w, err.Error(), http.StatusInternalServerError)
	}
	atom, err := feed.ToAtom()
	if err != nil {
	http.Error(w, err.Error(), http.StatusInternalServerError)
	return
	}
	w.Header().Add("Content-Type", "application/xml")
	fmt.Fprint(w, atom)
	}

	func feedFIIs(w http.ResponseWriter, r *http.Request) {
	feed, err := getFeed(bson.M{"title": bson.M{"$regex": "^fii", "$options": "i"}}, "fii")
	if err != nil {
	http.Error(w, err.Error(), http.StatusInternalServerError)
	return
	}
	atom, err := feed.ToAtom()
	if err != nil {
	http.Error(w, err.Error(), http.StatusInternalServerError)
	return
	}
	w.Header().Add("Content-Type", "application/xml")
	fmt.Fprint(w, atom)
	}

	func redirectNews(w http.ResponseWriter, r *http.Request) {
	var newsID string
	var news News
	parts := regexpNews.FindStringSubmatch(r.URL.Path)
	if len(parts) > 1 {
	newsID = parts[1]
	} else {
	http.Error(w, "Page not found", http.StatusNotFound)
	return
	}
	coll, err := collection()
	if err != nil {
	http.Error(w, err.Error(), http.StatusInternalServerError)
	return
	}
	defer coll.Close()
	err = coll.FindId(newsID).One(&news)
	if err == mgo.ErrNotFound {
	http.Error(w, "News not found", http.StatusNotFound)
	return
	} else if err != nil {
	http.Error(w, err.Error(), http.StatusInternalServerError)
	return
	}
	w.Header().Add("Location", news.URL())
	w.WriteHeader(http.StatusMovedPermanently)
	}

	func main() {
	http.Handle("/all.atom", http.HandlerFunc(feedAll))
	http.Handle("/fii.atom", http.HandlerFunc(feedFIIs))
	http.Handle("/", http.HandlerFunc(redirectNews))
	http.ListenAndServe(listenHTTP, nil)
	}
	package main

	import (
	"bytes"
	"flag"
	"fmt"
	"github.com/globocom/tsuru/db/storage"
	"io/ioutil"
	"labix.org/v2/mgo"
	"launchpad.net/xmlpath"
	"log"
	"net/http"
	"regexp"
	"strings"
	"time"
	)

	const BaseURL = "http://www.bmfbovespa.com.br/Agencia-Noticias/ListarNoticias.aspx?idioma=pt-br&q=&tipoFiltro=%d&pg=%d"

	var (
	pathLink = xmlpath.MustCompile(`//ul[@id="linksNoticias"]/li/a`)
	pathHrefLink = xmlpath.MustCompile("./@href")
	idRegexp = regexp.MustCompile(`^ListarNoticias.aspx\?idioma=pt-br\&idNoticia=(\d+)\&.*$`)
	replaceLessThan = []byte{' ', '<', ' '}
	replaceGreaterThan = []byte{' ', '>', ' '}
	replaceLessOrEqual = []byte{' ', '<', '=', ' '}
	replaceGreaterOrEqual = []byte{' ', '>', '=', ' '}
	tickerTimer time.Duration
	filter int
	)

	func init() {
	flag.DurationVar(&tickerTimer, "interval", 10*time.Minute, "Ticker interval")
	flag.IntVar(&filter, "filter", 0, "News filter (0 for daily, 1 for weekly)")
	flag.Parse()
	}

	type News struct {
	ID string `bson:"_id"`
	Title string
	Date time.Time
	}

	func collection() (*storage.Collection, error) {
	storage, err := storage.Open("localhost:27017", "bovespa_plantao_empresas")
	if err != nil {
	return nil, err
	}
	coll := storage.Collection("news")
	coll.EnsureIndex(mgo.Index{Key: []string{"title"}, Background: true, Sparse: true})
	coll.EnsureIndex(mgo.Index{Key: []string{"-date"}, Background: true, Sparse: true})
	coll.EnsureIndex(mgo.Index{Key: []string{"title", "-date"}, Background: true, Sparse: true})
	return coll, nil
	}

	func downloadContent(page int) (*xmlpath.Node, error) {
	url := fmt.Sprintf(BaseURL, filter, page)
	resp, err := http.Get(url)
	if err != nil {
	return nil, err
	}
	defer resp.Body.Close()
	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
	return nil, err
	}
	content = bytes.Replace(content, replaceLessThan, nil, -1)
	content = bytes.Replace(content, replaceGreaterThan, nil, -1)
	content = bytes.Replace(content, replaceLessOrEqual, nil, -1)
	content = bytes.Replace(content, replaceGreaterOrEqual, nil, -1)
	node, err := xmlpath.ParseHTML(bytes.NewBuffer(content))
	if err != nil {
	return nil, err
	}
	return node, err
	}

	func saveNews(news []News) {
	coll, err := collection()
	if err != nil {
	log.Printf("[ERROR] Failed to save news: %s", err)
	return
	}
	defer coll.Close()
	for _, n := range news {
	_, err = coll.UpsertId(n.ID, n)
	if err != nil {
	log.Printf("[ERROR] Failed to save news: %s", err)
	}
	}
	}

	func collectNews(node *xmlpath.Node) []News {
	location, _ := time.LoadLocation("America/Sao_Paulo")
	var err error
	var newsList []News
	iter := pathLink.Iter(node)
	for iter.Next() {
	var news News
	target, ok := pathHrefLink.String(iter.Node())
	if !ok {
	continue
	}
	parts := idRegexp.FindStringSubmatch(target)
	if len(parts) > 1 {
	news.ID = parts[1]
	}
	content := iter.Node().String()
	content = strings.TrimSpace(content)
	parts = strings.SplitN(content, " - ", 2)
	if len(parts) < 2 {
	continue
	}
	news.Title = parts[1]
	news.Date, err = time.ParseInLocation("02/01/2006 15:04", parts[0], location)
	if err != nil {
	log.Printf("[WARNING] Wrong date for news: %s", err)
	continue
	}
	newsList = append(newsList, news)
	}
	return newsList
	}

	func run() {
	for i := 0; ; i++ {
	node, err := downloadContent(i + 1)
	if err != nil {
	log.Print(err)
	}
	if !pathLink.Exists(node) {
	break
	}
	newsList := collectNews(node)
	if len(newsList) > 0 {
	saveNews(newsList)
	}
	}
	}

	func main() {
	for _ = range time.Tick(tickerTimer) {
	run()
	}
	}