Last active
January 1, 2016 09:19
-
-
Save ovatsus/8123945 to your computer and use it in GitHub Desktop.
Alternative implementation of Screen Scraping College Football Statistics (http://jamessdixon.wordpress.com/2013/12/24/screen-scraping-college-football-statistics/)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#r @"packages\FSharp.Data.2.0.0-alpha2\lib\net40\FSharp.Data.dll" | |
#r "System.Xml.Linq" | |
open System.Xml.Linq | |
open FSharp.Data | |
open FSharp.Data.Json | |
open FSharp.Data.Json.Extensions | |
open FSharp.Net | |
// Without Type Providers | |
// Downloads the Yahoo Big Ten team-ranking page for `year` and returns the text
// assigned to its `rankingsTableData` JavaScript variable (trimmed, without the
// trailing ';').
// Fails with a descriptive message when the variable or its terminating ';'
// cannot be located in the response.
let getRecrutRankingsJsonString (year:int) =
    let url = "http://sports.yahoo.com/footballrecruiting/football/recruiting/teamrank/" + year.ToString() + "/BIG10/all"
    // Using Http.RequestString from FSharp.Data instead of WebRequest directly
    let htmlString = Http.RequestString url
    // Store header in a separate value so we can use .Length, instead of hardcoding 23
    let header = "var rankingsTableData ="
    // Ordinal comparison: the marker is literal page source, not natural-language text.
    let startPosition = htmlString.IndexOf(header, System.StringComparison.Ordinal)
    if startPosition < 0 then
        failwithf "Could not find '%s' in the response from %s" header url
    let valueStart = startPosition + header.Length
    // The value runs up to the first ';' after the header, so a ';' inside the
    // data itself would cut the result short.
    let endPosition = htmlString.IndexOf(";", valueStart, System.StringComparison.Ordinal)
    if endPosition < 0 then
        failwithf "Could not find the ';' terminating '%s' in the response from %s" header url
    htmlString.Substring(valueStart, endPosition - valueStart).Trim()
// Fetches the rankings data for `year` and returns an array of
// (name, rank) pairs, one per element of the parsed JSON array.
let getRecrutRankings year =
    // Use JsonValue from FSharp.Data
    let parsed = JsonValue.Parse(getRecrutRankingsJsonString year)
    [| for team in parsed.AsArray() -> team?name.AsString(), team?rank.AsInteger() |]
// Downloads the ESPN Big Ten standings page for `year` and returns the markup of
// the first <table> element that follows the "my-teams-table" marker, from the
// opening "<table" through the '>' that closes its "</table" tag.
// Fails with a descriptive message when any of the expected markers is missing.
let getConferenceStandingHtmlString (year:int) =
    let url = "http://espn.go.com/college-football/conferences/standings/_/id/5/year/" + year.ToString() + "/big-ten-conference"
    let htmlString = Http.RequestString url
    // Ordinal search from `startIndex` that reports a missing marker explicitly
    // instead of letting a -1 flow into the next IndexOf/Substring call.
    let indexOfOrFail (marker:string) (startIndex:int) =
        let position = htmlString.IndexOf(marker, startIndex, System.StringComparison.Ordinal)
        if position < 0 then
            failwithf "Could not find '%s' in the response from %s" marker url
        else
            position
    let divMarkerStartPosition = indexOfOrFail "my-teams-table" 0
    let tableStartPosition = indexOfOrFail "<table" divMarkerStartPosition
    let closingTagStartPosition = indexOfOrFail "</table" tableStartPosition
    // Include everything up to the '>' that ends the closing tag rather than
    // assuming it sits exactly 8 characters after the start of "</table".
    let closingTagEndPosition = indexOfOrFail ">" closingTagStartPosition
    htmlString.Substring(tableStartPosition, closingTagEndPosition - tableStartPosition + 1)
// Looks up `school` in the standings table for `year` and returns a pair of
// (text of the matching <td>, text of the element that follows it).
// Seq.find / Seq.head raise when the school cell or its following sibling is absent.
let getConferenceStanding year school =
    // Use XDocument from .NET 3.5 instead of old XmlDocument
    let standingsTable = XDocument.Parse(getConferenceStandingHtmlString year)
    let schoolCell =
        standingsTable.Descendants(XName.Get "td")
        |> Seq.find (fun cell -> cell.Value = school)
    let followingCell = Seq.head (schoolCell.ElementsAfterSelf())
    schoolCell.Value, followingCell.Value
// Collects the (school, standing) pair for each listed school in `year` and
// returns them as a list ordered by the standing text, highest first.
let getConferenceStandings year =
    let schools =
        [ "Nebraska"; "Michigan"; "Northwestern"; "Michigan State"; "Iowa"
          "Minnesota"; "Ohio State"; "Penn State"; "Wisconsin"; "Purdue"; "Indiana"; "Illinois" ]
    schools
    |> List.map (getConferenceStanding year) // Note the use of partial application
    |> List.sortBy snd // stable ascending sort on the standing text...
    |> List.rev        // ...then reversed to get descending order
// With Type Providers | |
// Step 1: save the sample data to disk
open System.IO | |
// Fetch the 2013 rankings data and store it next to this script as recrutRankings.json.
let recrutRankingsSampleJson = getRecrutRankingsJsonString 2013
do
    let samplePath = Path.Combine(__SOURCE_DIRECTORY__, "recrutRankings.json")
    File.WriteAllText(samplePath, recrutRankingsSampleJson)
// Fetch the 2013 standings table and store it next to this script as conferenceStanding.html.
let conferenceStandingSampleHtml = getConferenceStandingHtmlString 2013
do
    let samplePath = Path.Combine(__SOURCE_DIRECTORY__, "conferenceStanding.html")
    File.WriteAllText(samplePath, conferenceStandingSampleHtml)
// Step 2: use the sample in the type providers
// Step 1 saves both samples under __SOURCE_DIRECTORY__, so the relative sample
// paths are resolved against that same folder instead of relying on whatever
// default resolution folder the host (compiler, IDE, F# Interactive) picks.
type RecrutRankingsType = JsonProvider<"recrutRankings.json", ResolutionFolder=__SOURCE_DIRECTORY__>
type ConferenceStandingType = XmlProvider<"conferenceStanding.html", ResolutionFolder=__SOURCE_DIRECTORY__>
// Type-provider version of getRecrutRankings: parses the rankings data for
// `year` with RecrutRankingsType and returns an array of (Name, Rank) pairs.
let getRecrutRankingsWithTP year =
    let teams = RecrutRankingsType.Parse(getRecrutRankingsJsonString year)
    // you could also use the type provider inline instead of declaring it above:
    // let teams = JsonProvider<"recrutRankings.json">.Parse(getRecrutRankingsJsonString year)
    [| for team in teams -> team.Name, team.Rank |]
// XmlProvider doesn't work so well with Html, but let's try anyway.
// I suggest using Html Agility Pack instead. See http://blog.codebeside.org/blog/2013/10/14/fsharp-for-screen-scraping/
//
// Type-provider version of getConferenceStanding: returns a pair of
// (text of the cell equal to `school`, text of the cell that follows it).
let getConferenceStandingWithTP year school =
    let table = ConferenceStandingType.Parse(getConferenceStandingHtmlString year)
    // Flatten the cells of every row into one array, keeping document order.
    let allCells = [| for row in table.GetTrs() do yield! row.GetTds() |]
    // XmlProvider only has downwards navigation, so we have to workaround it by indexing into the parent collection:
    let byIndexing() =
        let schoolIndex = allCells |> Array.findIndex (fun cell -> cell.StringValue = Some school)
        allCells.[schoolIndex].StringValue.Value, allCells.[schoolIndex + 1].StringValue.Value
    // or by falling back to XElement like this:
    let byXElement() =
        let schoolCell = allCells |> Array.find (fun cell -> cell.StringValue = Some school)
        let followingCell = Seq.head (schoolCell.XElement.ElementsAfterSelf())
        schoolCell.StringValue.Value, followingCell.Value
    // Only the indexing variant is actually used; byXElement is kept as the alternative.
    byIndexing()
// Type-provider version of getConferenceStandings: collects the
// (school, standing) pair for each listed school in `year` and returns them
// as a list ordered by the standing text, highest first.
let getConferenceStandingsWithTP year =
    let schools =
        [ "Nebraska"; "Michigan"; "Northwestern"; "Michigan State"; "Iowa"
          "Minnesota"; "Ohio State"; "Penn State"; "Wisconsin"; "Purdue"; "Indiana"; "Illinois" ]
    schools
    |> List.map (getConferenceStandingWithTP year) // Note the use of partial application
    |> List.sortBy snd // stable ascending sort on the standing text...
    |> List.rev        // ...then reversed to get descending order
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment