Created
November 24, 2013 19:48
-
-
Save ya-pulser/7631527 to your computer and use it in GitHub Desktop.
Loads a dataset of movies, actors, directors and tags into a Neo4j database.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.neo4j.graphdb.GraphDatabaseService; | |
import org.neo4j.graphdb.Node; | |
import org.neo4j.graphdb.Relationship; | |
import org.neo4j.graphdb.RelationshipType; | |
import org.neo4j.graphdb.Transaction; | |
import org.neo4j.graphdb.factory.GraphDatabaseFactory; | |
import org.neo4j.graphdb.index.Index; | |
import java.io.BufferedReader; | |
import java.io.FileReader; | |
import java.util.HashMap; | |
import java.util.Map; | |
public class MovieLoader { | |
final static String path = "/Users/pulser/mine/movielens"; | |
final static String neoPath = path + "/neo"; | |
// final static String neoPath = "/usr/local/Cellar/neo4j/1.9.5/libexec/data/graph.db"; | |
private static enum RelTypes implements RelationshipType { | |
ACTS, | |
DIRECTS, | |
TAGS | |
} | |
public static void main(String[] args) throws Exception { | |
final GraphDatabaseService graphDb = new GraphDatabaseFactory().newEmbeddedDatabase(neoPath); | |
final Index<Node> nodeIndex = graphDb.index().forNodes("nodes"); | |
registerShutdownHook(graphDb); | |
final Transaction tx = graphDb.beginTx(); | |
try { | |
final Map<String, Node> movie2node = doLoadMovies(graphDb, nodeIndex); | |
doLoadActors(graphDb, movie2node); | |
doLoadDirectors(graphDb, movie2node); | |
doLoadTags(graphDb, movie2node); | |
tx.success(); | |
} finally { | |
tx.finish(); | |
} | |
graphDb.shutdown(); | |
} | |
private static void registerShutdownHook(final GraphDatabaseService graphDb) { | |
// Registers a shutdown hook for the Neo4j instance so that it | |
// shuts down nicely when the VM exits (even if you "Ctrl-C" the | |
// running application). | |
Runtime.getRuntime().addShutdownHook(new Thread() { | |
@Override | |
public void run() { | |
graphDb.shutdown(); | |
} | |
}); | |
} | |
private static String getMovieHash(final String[] parts) { | |
final StringBuilder sb = new StringBuilder(); | |
for (int i = 1; i < parts.length; i++) { | |
if (sb.length() > 0) sb.append("\t"); | |
sb.append(parts[i]); | |
} | |
return sb.toString(); | |
} | |
private static Map<String, Node> doLoadMovies(final GraphDatabaseService graphDb, final Index<Node> nodeIndex) throws Exception { | |
final String[] pos = new String[]{ | |
"id", "title", "year", "x-img.imdb", "x-imdb", "x-id.rt", "x-img.rt"}; | |
final Map<String, Node> cache = new HashMap<String, Node>(); | |
final Map<String, Node> out = new HashMap<String, Node>(); | |
final BufferedReader in = new BufferedReader(new FileReader(path + "/mine.movie.dat")); | |
in.readLine(); // skip header line | |
String line; | |
int count = 0; | |
while ((line = in.readLine()) != null) { | |
if (++count % 1000 == 0) System.out.println("Loaded movies: " + count); | |
final String[] shards = line.split("\t"); | |
assert shards.length == pos.length; | |
final String movieId = shards[0]; | |
final String hash = getMovieHash(shards); | |
if (!cache.containsKey(hash)) { | |
final Node movie = graphDb.createNode(); | |
for (int i = 0; i < shards.length; i++) { | |
movie.setProperty(pos[i], shards[i]); | |
} | |
movie.setProperty("movieId", movieId); | |
movie.setProperty("type", "movie"); | |
nodeIndex.add(movie, "movie", movieId); | |
cache.put(hash, movie); | |
} | |
final Node movie = cache.get(hash); | |
out.put(movieId, movie); | |
} | |
in.close(); | |
System.out.println("Found " + cache.size() + " distinct movies from " + out.size() + " total movies"); | |
return out; | |
} | |
private static void doLoadActors(final GraphDatabaseService graphDb, final Map<String, Node> movie2id) throws Exception { | |
// movieID actorID actorName ranking | |
final String[] pos = new String[]{"movieId", "actorId", "actorName"}; | |
final Map<String, Node> actorCache = new HashMap<String, Node>(); | |
final BufferedReader in = new BufferedReader(new FileReader(path + "/movie_actors.dat")); | |
in.readLine(); // skip header line | |
String line; | |
int count = 0; | |
while ((line = in.readLine()) != null) { | |
if (++count % 10000 == 0) System.out.println("Loaded references: " + count); | |
final String[] shards = line.split("\t"); | |
assert shards.length == pos.length; | |
final String movieId = shards[0]; | |
final String actorId = shards[1]; | |
final String actorName = shards[2]; | |
if (!actorCache.containsKey(actorId)) { | |
final Node actor = graphDb.createNode(); | |
actor.setProperty("name", actorName); | |
actor.setProperty("actorId", actorId); | |
actor.setProperty("type", "actor"); | |
actorCache.put(actorId, actor); | |
} | |
final Node movie = movie2id.get(movieId); | |
if (movie == null) throw new IllegalStateException("Failed to find movie '" + movieId + "' for " + line); | |
actorCache.get(actorId).createRelationshipTo(movie, RelTypes.ACTS); | |
} | |
System.out.println("Loaded " + actorCache.size() + " different actors"); | |
in.close(); | |
} | |
private static void doLoadDirectors(final GraphDatabaseService graphDb, final Map<String, Node> movie2id) throws Exception { | |
// movieID actorID actorName ranking | |
final String[] pos = new String[]{"movieId", "directorId", "directorId"}; | |
final Map<String, Node> directorCache = new HashMap<String, Node>(); | |
final BufferedReader in = new BufferedReader(new FileReader(path + "/movie_directors.dat")); | |
in.readLine(); // skip header line | |
String line; | |
int count = 0; | |
while ((line = in.readLine()) != null) { | |
if (++count % 10000 == 0) System.out.println("Loaded references: " + count); | |
final String[] shards = line.split("\t"); | |
assert shards.length == pos.length; | |
final String movieId = shards[0]; | |
final String directorId = shards[1]; | |
final String directorName = shards[2]; | |
if (!directorCache.containsKey(directorId)) { | |
final Node director = graphDb.createNode(); | |
director.setProperty("name", directorName); | |
director.setProperty("directorId", directorId); | |
director.setProperty("type", "director"); | |
directorCache.put(directorId, director); | |
} | |
final Node movie = movie2id.get(movieId); | |
if (movie == null) throw new IllegalStateException("Failed to find movie '" + movieId + "' for " + line); | |
directorCache.get(directorId).createRelationshipTo(movie, RelTypes.DIRECTS); | |
} | |
System.out.println("Loaded " + directorCache.size() + " different directors"); | |
in.close(); | |
} | |
private static void doLoadTags(final GraphDatabaseService graphDb, final Map<String, Node> movie2id) throws Exception { | |
final Map<String, Node> tagCache = new HashMap<String, Node>(); | |
{ | |
final String[] pos = new String[]{"id", "value"}; | |
final BufferedReader in = new BufferedReader(new FileReader(path + "/tags.dat")); | |
in.readLine(); // skip header line | |
String line; | |
int count = 0; | |
while ((line = in.readLine()) != null) { | |
if (++count % 10000 == 0) System.out.println("Loaded references: " + count); | |
final String[] shards = line.split("\t"); | |
assert shards.length == pos.length; | |
final String tagId = shards[0]; | |
final String tag = shards[1]; | |
if (!tagCache.containsKey(tagId)) { | |
final Node node = graphDb.createNode(); | |
node.setProperty("tagId", tagId); | |
node.setProperty("tag", tag); | |
node.setProperty("type", "tag"); | |
tagCache.put(tagId, node); | |
} | |
} | |
System.out.println("Loaded " + tagCache.size() + " different tags"); | |
in.close(); | |
} | |
final BufferedReader in = new BufferedReader(new FileReader(path + "/movie_tags.dat")); | |
in.readLine(); // skip header line | |
String line; | |
int count = 0; | |
while ((line = in.readLine()) != null) { | |
if (++count % 10000 == 0) System.out.println("Loaded references: " + count); | |
final String[] shards = line.split("\t"); | |
final String movieId = shards[0]; | |
final String tagId = shards[1]; | |
final String weight = shards[2]; | |
final Relationship rel = tagCache.get(tagId).createRelationshipTo(movie2id.get(movieId), RelTypes.TAGS); | |
rel.setProperty("weight", Integer.valueOf(weight)); | |
} | |
in.close(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment