Skip to content

Instantly share code, notes, and snippets.

@kritzikratzi
Created February 11, 2012 16:04
Show Gist options
  • Save kritzikratzi/1801406 to your computer and use it in GitHub Desktop.
Save kritzikratzi/1801406 to your computer and use it in GitHub Desktop.
Parsing Wikipedia is funky ...
package jobs;
import static utils.MWUtils.fetch;
import static utils.MWUtils.findReferences;
import static utils.MWUtils.getTagParameter;
import static utils.MWUtils.removeLinks;
import static utils.MWUtils.clean;
import static utils.MWUtils.findHeadlines;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import controllers.Admin;
import models.Episode;
import models.Series;
import play.Logger;
import play.db.jpa.JPA;
import play.jobs.Every;
import play.jobs.Job;
import utils.MWUtils;
@Every("1min")
public class RefreshJob extends Job<RefreshJob.Result>{
// Patterns and formatters used to pull air dates and season/episode numbers out of wiki markup.
// Finds a "Start date|YYYY|M|D" fragment (case-insensitive); group 1 is the "YYYY|M|D" part.
private final static Pattern shortPattern = Pattern.compile( "Start date\\|([0-9]{4}\\|[0-9]{1,2}\\|[0-9]{1,2})", Pattern.CASE_INSENSITIVE );
// Finds a "<letters> <digits>, <digits>" fragment such as "February 11, 2012"; group 1 is the whole fragment.
private final static Pattern longPattern = Pattern.compile( "([a-z]+\\s+[0-9]+,\\s+[0-9]+)", Pattern.CASE_INSENSITIVE );
// Parses group 1 of shortPattern (the '|' characters are literal separators).
private final static DateTimeFormatter shortParser = DateTimeFormat.forPattern( "yyyy|MM|dd" );
// Parses group 1 of longPattern, reading the month name in English.
private final static DateTimeFormatter longParser = DateTimeFormat.forPattern( "MMMM dd, yyyy" ).withLocale( Locale.ENGLISH );
// A run of decimal digits; group 1 is the run. Used with matches() for "is a plain number" and with find() for "first number".
private final static Pattern numberPattern = Pattern.compile( "([0-9]+)" );
// Episode numbers written as "<digits>.<digits>"; the code treats group 1 as the season and group 2 as the episode.
private final static Pattern episodeNumberPattern = Pattern.compile( "([0-9]+)\\.([0-9]+)" );
// Production codes written as "#<digits>.<digits>"; the code treats group 1 as the season and group 2 as the episode in that season.
private final static Pattern prodCodePattern = Pattern.compile( "#([0-9]+)\\.([0-9]+)" );
// Article titles ending in "(Season N)" or "(Series N)"; group 1 is the word, group 2 is N.
private final static Pattern articleSeasonPattern = Pattern.compile( ".*\\((Season|Series) ([0-9]+)\\)$", Pattern.CASE_INSENSITIVE );
// Headlines starting with "Season N" or "Series N"; group 1 is the word, group 2 is N.
private final static Pattern seasonHeadlinePattern = Pattern.compile( "(Season|Series) ([0-9]+).*", Pattern.CASE_INSENSITIVE );
// Headlines starting with "Season <letters>" or "Series <letters>" (e.g. "Season One"); group 2 is the spelled-out word.
private final static Pattern weirdSeasonHeadlinePattern = Pattern.compile( "(Season|Series) ([A-Z]+).*", Pattern.CASE_INSENSITIVE );
/** Id of the series to refresh, or null to let the job pick the series with the oldest lastRefresh. */
private final Long seriesId;

/**
 * Creates a job that is not bound to a particular series; when run, it
 * refreshes whichever series comes first when ordered by lastRefresh.
 */
public RefreshJob(){
    this( null );
}

/**
 * Creates a job that refreshes one specific series.
 *
 * @param seriesId id of the series to refresh; null behaves like the no-arg constructor
 */
public RefreshJob( Long seriesId ){
    this.seriesId = seriesId;
}
/**
 * Refreshes one series: fetches its episodes from the wiki, saves the ones
 * that are not stored yet and updates stored ones that differ.
 * <p>
 * The series is the one given by seriesId, or, if seriesId is null, the one
 * that comes first when ordered by lastRefresh.
 *
 * @return the refreshed series, all parsed episodes and the newly saved ones;
 *         a Result holding three nulls if no series was found
 * @throws Exception declared by the overridden method; failures while saving a
 *         single episode are logged and do not abort the run
 */
@Override
public Result doJobWithResult() throws Exception {
    Series series;
    if( seriesId == null ) series = Series.find( "order by lastRefresh asc" ).first();
    else series = Series.findById( seriesId );

    if( series == null ){
        // The format string needs a placeholder, otherwise the id is silently dropped from the log line.
        Logger.info( "Background Job: no such series: %s", seriesId );
        return new Result( null, null, null );
    }

    // Persist the new lastRefresh right away in its own transaction and open a
    // fresh one for the episode updates (presumably so that a failing fetch does
    // not keep this series at the front of the refresh order -- confirm).
    series.lastRefresh = new DateTime();
    series.save();
    JPA.em().getTransaction().commit();
    JPA.em().getTransaction().begin();

    Logger.info( "Background Job: refreshing %s", series.name );
    ArrayList<Episode> episodes = fetchEpisodes( series );
    ArrayList<Episode> added = new ArrayList<Episode>();

    for( Episode episode : episodes ){
        try{
            // Does this episode already exist?
            Episode old = Episode.find( "series = ? and episodeNumber = ?", series, episode.episodeNumber ).first();
            if( old == null ){
                episode.save();
                added.add( episode );
            }
            else if( !old.equals( episode ) ){
                // Copy over the fields that may have changed on the wiki.
                old.title = episode.title;
                old.airDate = episode.airDate;
                old.episodeNumber = episode.episodeNumber;
                old.episodeNumberString = episode.episodeNumberString;
                old.episodeNumberInSeason = episode.episodeNumberInSeason;
                old.episodeNumberInSeasonString = episode.episodeNumberInSeasonString;
                old.season = episode.season;
                old.save();
            }
        }
        catch( Exception e ){
            // Best effort: one broken episode must not stop the others. Log through
            // the framework logger (with stack trace) instead of printing to stderr.
            Logger.error( e, "Background Job: could not save episode %s", episode );
        }
    }

    series.lastSuccess = new DateTime();
    series.save();
    return new Result( series, episodes, added );
}
/**
 * Outcome of one refresh run, as returned by doJobWithResult().
 */
public static class Result{

    /** The series that was refreshed; null if no series was found. */
    public Series series;

    /** All episodes parsed during the run; null if no series was found. */
    public List<Episode> episodes;

    /** The parsed episodes that were not stored before and have been saved; null if no series was found. */
    public List<Episode> added;

    /**
     * @param series   the refreshed series, may be null
     * @param episodes all parsed episodes, may be null
     * @param added    the newly saved episodes, may be null
     */
    public Result( Series series, List<Episode> episodes, List<Episode> added ){
        this.added = added;
        this.episodes = episodes;
        this.series = series;
    }
}
/**
 * Fetches the episode list of a series from its wiki article and builds one
 * unsaved Episode per {{Episode ...}} tag found.
 * <p>
 * The article named by series.wikiEpisodeUrl is fetched together with every
 * article it references through findReferences( ":", ... ). Season and episode
 * numbers are taken from the tag parameters first and then, in turn, from the
 * production code, the article title and the closest preceding headline. If
 * not a single episode ends up with a season, seasons are assigned by counting
 * how often the in-season episode number drops.
 *
 * @param series the series to scan; its wikiEpisodeUrl names the main article
 * @return the parsed episodes in scan order (articles sorted by title, tags in
 *         the order findReferences reports them); never null
 */
public static ArrayList<Episode> fetchEpisodes( Series series ){
    // Grab the main page plus every referenced sub-article. The TreeMap keeps
    // the articles sorted by title, which fixes the scan order below.
    TreeMap<String, String> articleContents = new TreeMap<String, String>();
    String content = MWUtils.fetch( series.wikiEpisodeUrl );
    articleContents.put( series.wikiEpisodeUrl, content );

    ArrayList<String> refs = findReferences( ":", content );
    for( String ref : refs ){
        // Drops the first three and last two characters -- presumably the
        // "{{:" and "}}" around the title; TODO confirm against findReferences.
        String articleTitle = ref.substring( 3, ref.length() - 2 );
        Logger.info( "Fetching %s", articleTitle );
        articleContents.put( articleTitle, fetch( articleTitle ) );
    }

    // Now grab all {{Episode ...}} tags.
    ArrayList<Episode> episodes = new ArrayList<Episode>( refs.size() );
    for( String articleTitle : articleContents.keySet() ){
        String articleContent = articleContents.get( articleTitle );
        Logger.info( "Scanning %s", articleTitle );
        for( String ref : MWUtils.findReferences( "Episode", articleContent ) ){
            episodes.add( parseEpisode( series, articleTitle, articleContent, ref ) );
        }
    }

    assignSeasonsIfAllMissing( episodes );
    renumberAfterSeasonJump( episodes );

    for( Episode e : episodes ){
        Logger.info( "> Found %s", e.toString() );
    }
    return episodes;
}

// Builds one Episode from a single {{Episode ...}} tag found in the given article.
private static Episode parseEpisode( Series series, String articleTitle, String articleContent, String ref ){
    Episode episode = new Episode();
    episode.series = series;
    episode.title = clean( getTagParameter( "Title", ref, "?" ) );

    String num = clean( getTagParameter( "EpisodeNumber", ref, "-1" ) );
    Matcher matcher = episodeNumberPattern.matcher( num );
    if( matcher.matches() ){
        // Oddity number 1: EpisodeNumber may be written as "SS.EE".
        int season = firstInt( matcher.group( 1 ) );
        int inSeason = firstInt( matcher.group( 2 ) );
        episode.episodeNumber = season*100 + inSeason;
        episode.episodeNumberString = episode.episodeNumber + "";
        episode.episodeNumberInSeason = inSeason;
        episode.episodeNumberInSeasonString = inSeason + "";
        episode.season = season;
    }
    else{
        if( !numberPattern.matcher( num ).matches() ){
            // Neither a plain number nor "SS.EE": complain, then salvage the first number anyway.
            Logger.error( "The world is not going to stand tomorrow. EpisodeNumber was formatted as %s", num );
        }
        // EpisodeNumber2 falls back to EpisodeNumber when it is absent.
        String num2 = clean( getTagParameter( "EpisodeNumber2", ref, getTagParameter( "EpisodeNumber", ref, "-1" ) ) );
        episode.episodeNumber = firstInt( num );
        episode.episodeNumberInSeason = firstInt( num2 );
        episode.episodeNumberString = num;
        episode.episodeNumberInSeasonString = num2;
    }

    // Sometimes EpisodeNumber and EpisodeNumber2 are reversed: the overall
    // number can never be smaller than the in-season number, so swap them.
    if( episode.episodeNumber > 0 && episode.episodeNumberInSeason > 0 && episode.episodeNumber < episode.episodeNumberInSeason ){
        int tmpNum = episode.episodeNumber;
        String tmpStr = episode.episodeNumberString;
        episode.episodeNumber = episode.episodeNumberInSeason;
        episode.episodeNumberString = episode.episodeNumberInSeasonString;
        episode.episodeNumberInSeason = tmpNum;
        episode.episodeNumberInSeasonString = tmpStr;
    }

    // The production code is only consulted while the season is still unknown.
    String prodCode = getTagParameter( "ProdCode", ref, null );
    if( prodCode != null && episode.season <= 0 ){
        // equalsIgnoreCase does not depend on the default locale, unlike toLowerCase().
        if( "pilot".equalsIgnoreCase( prodCode ) ){
            episode.season = 0;
            episode.episodeNumber = 0;
            episode.episodeNumberInSeason = 0;
            episode.episodeNumberString = "Pilot";
            episode.episodeNumberInSeasonString = "Pilot";
        }
        else if( ( matcher = prodCodePattern.matcher( prodCode ) ).matches() ){
            episode.season = firstInt( matcher.group( 1 ) );
            episode.episodeNumberInSeasonString = matcher.group( 2 );
            episode.episodeNumberInSeason = firstInt( matcher.group( 2 ) );
        }
        // Any other production code format is ignored.
    }

    // Still no season? Maybe the article title ends in "(Season N)".
    // Group 1 is the word "Season"/"Series"; the number is group 2.
    if( episode.season <= 0 && ( matcher = articleSeasonPattern.matcher( articleTitle ) ).matches() ){
        episode.season = firstInt( matcher.group( 2 ) );
    }

    // Still no season? Look at the last headline that precedes the tag.
    if( episode.season <= 0 ){
        int refStart = articleContent.indexOf( ref );
        // Guard against a tag that is not a verbatim substring of the article.
        if( refStart >= 0 ){
            ArrayList<String> headlines = findHeadlines( articleContent.substring( 0, refStart ) );
            if( !headlines.isEmpty() ){
                int season = getSeasonByHeadline( headlines.get( headlines.size() - 1 ) );
                if( season > 0 ) episode.season = season;
            }
        }
    }

    // Air date: try the {{Start date|...}} form first, then "Month D, YYYY".
    String dateStr = getTagParameter( "OriginalAirDate", ref, "" );
    episode.airDate = null;
    try{
        if( ( matcher = shortPattern.matcher( dateStr ) ).find() ){
            episode.airDate = shortParser.parseLocalDate( matcher.group( 1 ) );
        }
        else if( ( matcher = longPattern.matcher( dateStr ) ).find() ){
            episode.airDate = longParser.parseLocalDate( matcher.group( 1 ) );
        }
    }
    catch( IllegalArgumentException e ){
        // The regex matched something that is not a real date (e.g. an unknown
        // month name); keep the episode without a date instead of aborting the scan.
        Logger.warn( e, "Could not parse air date '%s' in %s", dateStr, articleTitle );
    }

    return episode;
}

// If no episode has a season, numbers the seasons 1, 2, ... by starting a new
// season each time the in-season episode number drops.
private static void assignSeasonsIfAllMissing( List<Episode> episodes ){
    if( episodes.isEmpty() ) return;

    boolean majorProblems = true;
    for( Episode e : episodes ) majorProblems &= e.season <= 0;
    if( !majorProblems ) return;

    Logger.info( "> Had major trouble identifying season. Just assigning shit randomly now" );
    int season = 1;
    int oldEpisode = -1;
    for( Episode e : episodes ){
        if( e.episodeNumberInSeason < oldEpisode ) season ++;
        e.season = season;
        oldEpisode = e.episodeNumberInSeason;
    }
}

// Detects in-season numbers that keep growing across a season change (which
// means they are really overall numbers) and, from that point on, replaces
// them with a counter that restarts at 1 on every such jump.
private static void renumberAfterSeasonJump( List<Episode> episodes ){
    boolean renumbering = false;
    int prevSeason = 1, prevEpInS = 1, newNum = 1;
    for( Episode e : episodes ){
        if( renumbering ){
            newNum ++;
        }
        if( e.season > prevSeason && e.episodeNumberInSeason > prevEpInS ){
            // A later season whose first episode number is higher than the last one seen: cannot be right.
            newNum = 1;
            renumbering = true;
        }
        if( e.season > 0 ){
            // Remember the values as parsed, before they are overwritten below.
            prevSeason = e.season;
            prevEpInS = e.episodeNumberInSeason;
        }
        if( renumbering ){
            e.episodeNumberInSeason = newNum;
            e.episodeNumberInSeasonString = newNum + "";
        }
    }
}
/**
 * Returns the first run of decimal digits found in a string as an int.
 *
 * @param str the text to search, may be null
 * @return the parsed number, or -1 if str is null, contains no digit, or the
 *         digit run does not fit into an int
 */
private static int firstInt( String str ){
    if( str == null ) return -1;

    Matcher matcher = numberPattern.matcher( str );
    if( !matcher.find() ) return -1;

    try{
        return Integer.parseInt( matcher.group( 1 ) );
    }
    catch( NumberFormatException e ){
        // Only digits can reach parseInt, so this means the number is too large for an int.
        return -1;
    }
}
/**
 * Tries to read a season number from a headline such as
 * "Season 1 (2009)", "Season 1" or "Season one" ("Series" works as well).
 * Spelled-out numbers are recognised from "one" to "twelve", in any case.
 *
 * @param text the headline text, may be null
 * @return the parsed season number, or -1 if none could be recognised
 */
private static int getSeasonByHeadline( String text ){
    if( text == null ) return -1;

    Matcher matcher = seasonHeadlinePattern.matcher( text );
    if( matcher.matches() ){
        return firstInt( matcher.group( 2 ) );
    }

    matcher = weirdSeasonHeadlinePattern.matcher( text );
    if( matcher.matches() ){
        // Lower-case with a fixed locale: with e.g. a Turkish default locale
        // "SIX" would become "sıx" and never match.
        String word = matcher.group( 2 ).toLowerCase( Locale.ENGLISH );
        String[] numberWords = {
            "one", "two", "three", "four", "five", "six",
            "seven", "eight", "nine", "ten", "eleven", "twelve"
        };
        for( int i = 0; i < numberWords.length; i++ ){
            if( numberWords[i].equals( word ) ) return i + 1;
        }
    }
    return -1;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment