Last active: August 29, 2015 13:57
-
-
Save fsouza/9715301 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import (
	"flag"
	"fmt"
	"log"
	"net/http"
	"regexp"
	"time"

	"github.com/globocom/tsuru/db/storage"
	"github.com/gorilla/feeds"
	"labix.org/v2/mgo"
	"labix.org/v2/mgo/bson"
)
const (
	// NewsURL is the template for a single news page on the BM&FBovespa
	// site; %s is filled with the numeric news ID.
	NewsURL = "http://www.bmfbovespa.com.br/agencia/corpo.asp?origem=exibir&id=%s"
	// Limit caps how many news items are included in a generated feed.
	Limit = 100
)

var (
	// listenHTTP is the address the HTTP server binds to (set via -listen).
	listenHTTP string
	// regexpNews extracts the numeric news ID from paths like /bovespa/12345.
	regexpNews = regexp.MustCompile(`^/bovespa/(\d+)$`)
)

// init registers and parses the command-line flags.
// NOTE(review): calling flag.Parse in init is discouraged; consider moving
// the Parse call into main.
func init() {
	flag.StringVar(&listenHTTP, "listen", "127.0.0.1:7676", "address to listen to connections")
	flag.Parse()
}
// News is a single news entry as stored in MongoDB. ID is the Bovespa
// numeric news identifier and doubles as the Mongo _id.
type News struct {
	ID    string `bson:"_id"`
	Title string
	Date  time.Time
}

// URL returns the public BM&FBovespa page for this news item.
func (n *News) URL() string {
	return fmt.Sprintf(NewsURL, n.ID)
}
func collection() (*storage.Collection, error) { | |
storage, err := storage.Open("localhost:27017", "bovespa_plantao_empresas") | |
if err != nil { | |
return nil, err | |
} | |
coll := storage.Collection("news") | |
coll.EnsureIndex(mgo.Index{Key: []string{"title"}, Background: true, Sparse: true}) | |
coll.EnsureIndex(mgo.Index{Key: []string{"-date"}, Background: true, Sparse: true}) | |
coll.EnsureIndex(mgo.Index{Key: []string{"title", "-date"}, Background: true, Sparse: true}) | |
return coll, nil | |
} | |
func getFeed(query bson.M, id string) (*feeds.Feed, error) { | |
coll, err := collection() | |
if err != nil { | |
return nil, err | |
} | |
defer coll.Close() | |
var newsList []News | |
err = coll.Find(query).Sort("-date").Limit(Limit).All(&newsList) | |
if err != nil { | |
return nil, err | |
} | |
location, _ := time.LoadLocation("America/Sao_Paulo") | |
updated := time.Now() | |
if len(newsList) > 0 { | |
updated = newsList[0].Date.In(location) | |
} | |
feed := &feeds.Feed{ | |
Title: "Bovespa - Plantão Empresas", | |
Link: &feeds.Link{Href: "http://www.bmfbovespa.com.br/Agencia-Noticias/ListarNoticias.aspx?w=" + id}, | |
Description: "Notícias sobre empresas listadas na Bovespa", | |
Author: &feeds.Author{Name: "Francisco Souza", Email: "f@souza.cc"}, | |
Created: time.Date(2014, 3, 20, 10, 0, 0, 0, location), | |
Updated: updated, | |
} | |
for _, news := range newsList { | |
item := feeds.Item{ | |
Id: "http://plantao.souza.cc/bovespa/" + news.ID, | |
Title: news.Title, | |
Link: &feeds.Link{Href: "http://plantao.souza.cc/bovespa/" + news.ID}, | |
Description: news.Title, | |
Author: &feeds.Author{Name: "Bovespa", Email: "bovespa@bmfbovespa.com.br"}, | |
Created: news.Date, | |
Updated: news.Date, | |
} | |
feed.Items = append(feed.Items, &item) | |
} | |
return feed, nil | |
} | |
func feedAll(w http.ResponseWriter, r *http.Request) { | |
feed, err := getFeed(bson.M{"title": bson.M{"$regex": "^((?!fii))", "$options": "i"}}, "all") | |
if err != nil { | |
http.Error(w, err.Error(), http.StatusInternalServerError) | |
} | |
atom, err := feed.ToAtom() | |
if err != nil { | |
http.Error(w, err.Error(), http.StatusInternalServerError) | |
return | |
} | |
w.Header().Add("Content-Type", "application/xml") | |
fmt.Fprint(w, atom) | |
} | |
func feedFIIs(w http.ResponseWriter, r *http.Request) { | |
feed, err := getFeed(bson.M{"title": bson.M{"$regex": "^fii", "$options": "i"}}, "fii") | |
if err != nil { | |
http.Error(w, err.Error(), http.StatusInternalServerError) | |
return | |
} | |
atom, err := feed.ToAtom() | |
if err != nil { | |
http.Error(w, err.Error(), http.StatusInternalServerError) | |
return | |
} | |
w.Header().Add("Content-Type", "application/xml") | |
fmt.Fprint(w, atom) | |
} | |
func redirectNews(w http.ResponseWriter, r *http.Request) { | |
var newsID string | |
var news News | |
parts := regexpNews.FindStringSubmatch(r.URL.Path) | |
if len(parts) > 1 { | |
newsID = parts[1] | |
} else { | |
http.Error(w, "Page not found", http.StatusNotFound) | |
return | |
} | |
coll, err := collection() | |
if err != nil { | |
http.Error(w, err.Error(), http.StatusInternalServerError) | |
return | |
} | |
defer coll.Close() | |
err = coll.FindId(newsID).One(&news) | |
if err == mgo.ErrNotFound { | |
http.Error(w, "News not found", http.StatusNotFound) | |
return | |
} else if err != nil { | |
http.Error(w, err.Error(), http.StatusInternalServerError) | |
return | |
} | |
w.Header().Add("Location", news.URL()) | |
w.WriteHeader(http.StatusMovedPermanently) | |
} | |
func main() { | |
http.Handle("/all.atom", http.HandlerFunc(feedAll)) | |
http.Handle("/fii.atom", http.HandlerFunc(feedFIIs)) | |
http.Handle("/", http.HandlerFunc(redirectNews)) | |
http.ListenAndServe(listenHTTP, nil) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bytes" | |
"flag" | |
"fmt" | |
"github.com/globocom/tsuru/db/storage" | |
"io/ioutil" | |
"labix.org/v2/mgo" | |
"launchpad.net/xmlpath" | |
"log" | |
"net/http" | |
"regexp" | |
"strings" | |
"time" | |
) | |
// BaseURL is the listing-page template: the first %d is the news filter
// (tipoFiltro), the second %d is the 1-based page number.
const BaseURL = "http://www.bmfbovespa.com.br/Agencia-Noticias/ListarNoticias.aspx?idioma=pt-br&q=&tipoFiltro=%d&pg=%d"

var (
	// pathLink selects the anchor of each news entry in the listing page.
	pathLink = xmlpath.MustCompile(`//ul[@id="linksNoticias"]/li/a`)
	// pathHrefLink reads the href attribute of a selected anchor.
	pathHrefLink = xmlpath.MustCompile("./@href")
	// idRegexp pulls the numeric news id out of an entry's href.
	idRegexp = regexp.MustCompile(`^ListarNoticias.aspx\?idioma=pt-br\&idNoticia=(\d+)\&.*$`)
	// The site ships unescaped comparison operators (" < ", " > ", " <= ",
	// " >= ") that break HTML parsing; these byte patterns are stripped
	// from the page before it is parsed.
	replaceLessThan       = []byte{' ', '<', ' '}
	replaceGreaterThan    = []byte{' ', '>', ' '}
	replaceLessOrEqual    = []byte{' ', '<', '=', ' '}
	replaceGreaterOrEqual = []byte{' ', '>', '=', ' '}
	// tickerTimer is the interval between scraping cycles (-interval flag).
	tickerTimer time.Duration
	// filter selects which news listing to scrape (-filter flag).
	filter int
)

// init registers and parses the command-line flags.
// NOTE(review): calling flag.Parse in init is discouraged; consider moving
// the Parse call into main.
func init() {
	flag.DurationVar(&tickerTimer, "interval", 10*time.Minute, "Ticker interval")
	flag.IntVar(&filter, "filter", 0, "News filter (0 for daily, 1 for weekly)")
	flag.Parse()
}
// News is a single scraped news entry. ID is the Bovespa numeric news
// identifier and doubles as the Mongo _id on upsert.
type News struct {
	ID    string `bson:"_id"`
	Title string
	Date  time.Time
}
func collection() (*storage.Collection, error) { | |
storage, err := storage.Open("localhost:27017", "bovespa_plantao_empresas") | |
if err != nil { | |
return nil, err | |
} | |
coll := storage.Collection("news") | |
coll.EnsureIndex(mgo.Index{Key: []string{"title"}, Background: true, Sparse: true}) | |
coll.EnsureIndex(mgo.Index{Key: []string{"-date"}, Background: true, Sparse: true}) | |
coll.EnsureIndex(mgo.Index{Key: []string{"title", "-date"}, Background: true, Sparse: true}) | |
return coll, nil | |
} | |
func downloadContent(page int) (*xmlpath.Node, error) { | |
url := fmt.Sprintf(BaseURL, filter, page) | |
resp, err := http.Get(url) | |
if err != nil { | |
return nil, err | |
} | |
defer resp.Body.Close() | |
content, err := ioutil.ReadAll(resp.Body) | |
if err != nil { | |
return nil, err | |
} | |
content = bytes.Replace(content, replaceLessThan, nil, -1) | |
content = bytes.Replace(content, replaceGreaterThan, nil, -1) | |
content = bytes.Replace(content, replaceLessOrEqual, nil, -1) | |
content = bytes.Replace(content, replaceGreaterOrEqual, nil, -1) | |
node, err := xmlpath.ParseHTML(bytes.NewBuffer(content)) | |
if err != nil { | |
return nil, err | |
} | |
return node, err | |
} | |
func saveNews(news []News) { | |
coll, err := collection() | |
if err != nil { | |
log.Printf("[ERROR] Failed to save news: %s", err) | |
return | |
} | |
defer coll.Close() | |
for _, n := range news { | |
_, err = coll.UpsertId(n.ID, n) | |
if err != nil { | |
log.Printf("[ERROR] Failed to save news: %s", err) | |
} | |
} | |
} | |
func collectNews(node *xmlpath.Node) []News { | |
location, _ := time.LoadLocation("America/Sao_Paulo") | |
var err error | |
var newsList []News | |
iter := pathLink.Iter(node) | |
for iter.Next() { | |
var news News | |
target, ok := pathHrefLink.String(iter.Node()) | |
if !ok { | |
continue | |
} | |
parts := idRegexp.FindStringSubmatch(target) | |
if len(parts) > 1 { | |
news.ID = parts[1] | |
} | |
content := iter.Node().String() | |
content = strings.TrimSpace(content) | |
parts = strings.SplitN(content, " - ", 2) | |
if len(parts) < 2 { | |
continue | |
} | |
news.Title = parts[1] | |
news.Date, err = time.ParseInLocation("02/01/2006 15:04", parts[0], location) | |
if err != nil { | |
log.Printf("[WARNING] Wrong date for news: %s", err) | |
continue | |
} | |
newsList = append(newsList, news) | |
} | |
return newsList | |
} | |
func run() { | |
for i := 0; ; i++ { | |
node, err := downloadContent(i + 1) | |
if err != nil { | |
log.Print(err) | |
} | |
if !pathLink.Exists(node) { | |
break | |
} | |
newsList := collectNews(node) | |
if len(newsList) > 0 { | |
saveNews(newsList) | |
} | |
} | |
} | |
func main() { | |
for _ = range time.Tick(tickerTimer) { | |
run() | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment