Skip to content

Instantly share code, notes, and snippets.

@ya-pulser
Created November 24, 2013 19:48
Show Gist options
  • Save ya-pulser/7631527 to your computer and use it in GitHub Desktop.
Save ya-pulser/7631527 to your computer and use it in GitHub Desktop.
Loads a dataset of movies, actors, directors, and tags into a Neo4j database.
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Relationship;
import org.neo4j.graphdb.RelationshipType;
import org.neo4j.graphdb.Transaction;
import org.neo4j.graphdb.factory.GraphDatabaseFactory;
import org.neo4j.graphdb.index.Index;
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.HashMap;
import java.util.Map;
/**
 * Command-line loader that reads tab-separated movie, actor, director and tag
 * files from {@link #path} and writes them as a graph into the embedded Neo4j
 * store at {@link #neoPath}.
 *
 * <p>Resulting graph shape:
 * <ul>
 *   <li>one node per distinct movie row (rows that differ only in their id are merged),</li>
 *   <li>one node per actor, director and tag id,</li>
 *   <li>{@code actor -ACTS-> movie}, {@code director -DIRECTS-> movie} and
 *       {@code tag -TAGS-> movie} relationships; {@code TAGS} carries an integer
 *       {@code weight} property.</li>
 * </ul>
 *
 * <p>The whole load runs inside a single transaction, so either everything is
 * committed or nothing is.
 */
public class MovieLoader {
    /** Directory that contains the tab-separated input files. */
    final static String path = "/Users/pulser/mine/movielens";
    /** Directory of the embedded Neo4j store that gets populated. */
    final static String neoPath = path + "/neo";
    // final static String neoPath = "/usr/local/Cellar/neo4j/1.9.5/libexec/data/graph.db";

    /** Relationship types created by this loader; each points from the non-movie node to the movie node. */
    private static enum RelTypes implements RelationshipType {
        ACTS,
        DIRECTS,
        TAGS
    }

    /**
     * Opens the embedded database, loads all input files in one transaction and
     * shuts the database down again.
     *
     * @param args ignored
     * @throws Exception if an input file cannot be read or contains a row that
     *                   cannot be loaded; in that case the transaction is not committed
     */
    public static void main(String[] args) throws Exception {
        final GraphDatabaseService graphDb = new GraphDatabaseFactory().newEmbeddedDatabase(neoPath);
        // Register the hook before doing anything else with the database so that
        // an early failure or Ctrl-C still shuts the store down cleanly.
        registerShutdownHook(graphDb);
        try {
            final Index<Node> nodeIndex = graphDb.index().forNodes("nodes");
            final Transaction tx = graphDb.beginTx();
            try {
                final Map<String, Node> movie2node = doLoadMovies(graphDb, nodeIndex);
                doLoadActors(graphDb, movie2node);
                doLoadDirectors(graphDb, movie2node);
                doLoadTags(graphDb, movie2node);
                tx.success();
            } finally {
                // Commits if success() was reached, otherwise rolls back.
                tx.finish();
            }
        } finally {
            graphDb.shutdown();
        }
    }

    /**
     * Registers a JVM shutdown hook that shuts the Neo4j instance down when the
     * VM exits (even if the running application is interrupted with Ctrl-C).
     *
     * @param graphDb database to shut down on VM exit
     */
    private static void registerShutdownHook(final GraphDatabaseService graphDb) {
        Runtime.getRuntime().addShutdownHook(new Thread() {
            @Override
            public void run() {
                graphDb.shutdown();
            }
        });
    }

    /**
     * Builds the de-duplication key of a movie row: every column except the
     * first (the id), joined with tabs.
     *
     * <p>The separator is emitted by position rather than by "buffer is not
     * empty", so that empty leading columns are preserved and two rows with
     * different column layouts cannot collapse to the same key.
     *
     * @param parts columns of one movie row; index 0 is the movie id
     * @return tab-joined columns {@code parts[1..]}, or the empty string if there are none
     */
    private static String getMovieHash(final String[] parts) {
        final StringBuilder sb = new StringBuilder();
        for (int i = 1; i < parts.length; i++) {
            if (i > 1) {
                sb.append('\t');
            }
            sb.append(parts[i]);
        }
        return sb.toString();
    }

    /**
     * Splits a row on tabs and verifies that it has enough columns for the caller.
     *
     * <p>Only a minimum is enforced because {@code String.split} drops trailing
     * empty columns, so a row may legitimately be shorter than the file header.
     *
     * @param line       raw input line
     * @param minColumns number of leading columns the caller is going to read
     * @return the columns of {@code line}
     * @throws IllegalStateException if the row has fewer than {@code minColumns} columns
     */
    private static String[] splitColumns(final String line, final int minColumns) {
        final String[] shards = line.split("\t");
        if (shards.length < minColumns) {
            throw new IllegalStateException("Expected at least " + minColumns
                    + " tab-separated columns but found " + shards.length + " in: " + line);
        }
        return shards;
    }

    /**
     * Looks up a previously created node and fails with a descriptive message if it is missing.
     *
     * @param nodes map from id to node
     * @param kind  human-readable kind of node, used in the error message
     * @param id    id to look up
     * @param line  input line being processed, used in the error message
     * @return the node registered under {@code id}
     * @throws IllegalStateException if no node is registered under {@code id}
     */
    private static Node requireNode(final Map<String, Node> nodes, final String kind,
                                    final String id, final String line) {
        final Node node = nodes.get(id);
        if (node == null) {
            throw new IllegalStateException("Failed to find " + kind + " '" + id + "' for " + line);
        }
        return node;
    }

    /**
     * Loads {@code mine.movie.dat} and creates one node per distinct movie.
     *
     * <p>Rows whose columns after the id are identical are treated as the same
     * movie: only the first such row creates (and indexes) a node, and every
     * later id is mapped to that same node in the returned map.
     *
     * @param graphDb   database in which nodes are created
     * @param nodeIndex index to which each new movie node is added under key {@code "movie"}
     * @return map from every movie id seen in the file to its movie node
     * @throws Exception if the file cannot be read or a row has more columns than expected
     */
    private static Map<String, Node> doLoadMovies(final GraphDatabaseService graphDb, final Index<Node> nodeIndex) throws Exception {
        // Property name for each column, by position.
        final String[] pos = new String[]{
                "id", "title", "year", "x-img.imdb", "x-imdb", "x-id.rt", "x-img.rt"};
        final Map<String, Node> cache = new HashMap<String, Node>();
        final Map<String, Node> out = new HashMap<String, Node>();
        final BufferedReader in = new BufferedReader(new FileReader(path + "/mine.movie.dat"));
        try {
            in.readLine(); // skip header line
            String line;
            int count = 0;
            while ((line = in.readLine()) != null) {
                if (++count % 1000 == 0) {
                    System.out.println("Loaded movies: " + count);
                }
                final String[] shards = line.split("\t");
                // Fewer columns are fine (trailing empty columns are dropped by split),
                // but extra columns have no property name to go with them.
                if (shards.length > pos.length) {
                    throw new IllegalStateException("Expected at most " + pos.length
                            + " tab-separated columns but found " + shards.length + " in: " + line);
                }
                final String movieId = shards[0];
                final String hash = getMovieHash(shards);
                Node movie = cache.get(hash);
                if (movie == null) {
                    movie = graphDb.createNode();
                    for (int i = 0; i < shards.length; i++) {
                        movie.setProperty(pos[i], shards[i]);
                    }
                    movie.setProperty("movieId", movieId);
                    movie.setProperty("type", "movie");
                    nodeIndex.add(movie, "movie", movieId);
                    cache.put(hash, movie);
                }
                out.put(movieId, movie);
            }
        } finally {
            in.close();
        }
        System.out.println("Found " + cache.size() + " distinct movies from " + out.size() + " total movies");
        return out;
    }

    /**
     * Loads {@code movie_actors.dat} (columns: movieId, actorId, actorName, ...),
     * creating one node per actor id and an {@code ACTS} relationship from the
     * actor to each referenced movie.
     *
     * @param graphDb  database in which nodes and relationships are created
     * @param movie2id map from movie id to movie node, as returned by {@code doLoadMovies}
     * @throws Exception if the file cannot be read, a row is too short, or a row
     *                   references an unknown movie
     */
    private static void doLoadActors(final GraphDatabaseService graphDb, final Map<String, Node> movie2id) throws Exception {
        loadPeople(graphDb, movie2id, "/movie_actors.dat", "actor", RelTypes.ACTS);
    }

    /**
     * Loads {@code movie_directors.dat} (columns: movieId, directorId, directorName),
     * creating one node per director id and a {@code DIRECTS} relationship from
     * the director to each referenced movie.
     *
     * @param graphDb  database in which nodes and relationships are created
     * @param movie2id map from movie id to movie node, as returned by {@code doLoadMovies}
     * @throws Exception if the file cannot be read, a row is too short, or a row
     *                   references an unknown movie
     */
    private static void doLoadDirectors(final GraphDatabaseService graphDb, final Map<String, Node> movie2id) throws Exception {
        loadPeople(graphDb, movie2id, "/movie_directors.dat", "director", RelTypes.DIRECTS);
    }

    /**
     * Shared implementation for the actor and director files, which have the
     * same leading layout: movie id, person id, person name.
     *
     * @param graphDb  database in which nodes and relationships are created
     * @param movie2id map from movie id to movie node
     * @param fileName file name relative to {@link #path}, starting with {@code '/'}
     * @param type     value of the node's {@code type} property ({@code "actor"} or
     *                 {@code "director"}); the id property is named {@code type + "Id"}
     * @param relType  relationship type created from the person to the movie
     * @throws Exception if the file cannot be read, a row is too short, or a row
     *                   references an unknown movie
     */
    private static void loadPeople(final GraphDatabaseService graphDb, final Map<String, Node> movie2id,
                                   final String fileName, final String type, final RelTypes relType) throws Exception {
        final String idProperty = type + "Id";
        final Map<String, Node> personCache = new HashMap<String, Node>();
        final BufferedReader in = new BufferedReader(new FileReader(path + fileName));
        try {
            in.readLine(); // skip header line
            String line;
            int count = 0;
            while ((line = in.readLine()) != null) {
                if (++count % 10000 == 0) {
                    System.out.println("Loaded references: " + count);
                }
                final String[] shards = splitColumns(line, 3);
                final String movieId = shards[0];
                final String personId = shards[1];
                final String personName = shards[2];
                Node person = personCache.get(personId);
                if (person == null) {
                    person = graphDb.createNode();
                    person.setProperty("name", personName);
                    person.setProperty(idProperty, personId);
                    person.setProperty("type", type);
                    personCache.put(personId, person);
                }
                final Node movie = requireNode(movie2id, "movie", movieId, line);
                person.createRelationshipTo(movie, relType);
            }
            System.out.println("Loaded " + personCache.size() + " different " + type + "s");
        } finally {
            in.close();
        }
    }

    /**
     * Loads {@code tags.dat} (columns: id, value) and creates one node per tag id.
     *
     * @param graphDb database in which nodes are created
     * @return map from tag id to tag node
     * @throws Exception if the file cannot be read or a row is too short
     */
    private static Map<String, Node> loadTagNodes(final GraphDatabaseService graphDb) throws Exception {
        final Map<String, Node> tagCache = new HashMap<String, Node>();
        final BufferedReader in = new BufferedReader(new FileReader(path + "/tags.dat"));
        try {
            in.readLine(); // skip header line
            String line;
            int count = 0;
            while ((line = in.readLine()) != null) {
                if (++count % 10000 == 0) {
                    System.out.println("Loaded references: " + count);
                }
                final String[] shards = splitColumns(line, 2);
                final String tagId = shards[0];
                final String tag = shards[1];
                if (!tagCache.containsKey(tagId)) {
                    final Node node = graphDb.createNode();
                    node.setProperty("tagId", tagId);
                    node.setProperty("tag", tag);
                    node.setProperty("type", "tag");
                    tagCache.put(tagId, node);
                }
            }
            System.out.println("Loaded " + tagCache.size() + " different tags");
        } finally {
            in.close();
        }
        return tagCache;
    }

    /**
     * Loads the tag nodes from {@code tags.dat}, then reads {@code movie_tags.dat}
     * (columns: movieId, tagId, weight) and creates a {@code TAGS} relationship
     * from each tag to the referenced movie with an integer {@code weight} property.
     *
     * @param graphDb  database in which nodes and relationships are created
     * @param movie2id map from movie id to movie node, as returned by {@code doLoadMovies}
     * @throws Exception if a file cannot be read, a row is too short, a row
     *                   references an unknown tag or movie, or a weight is not an integer
     */
    private static void doLoadTags(final GraphDatabaseService graphDb, final Map<String, Node> movie2id) throws Exception {
        final Map<String, Node> tagCache = loadTagNodes(graphDb);
        final BufferedReader in = new BufferedReader(new FileReader(path + "/movie_tags.dat"));
        try {
            in.readLine(); // skip header line
            String line;
            int count = 0;
            while ((line = in.readLine()) != null) {
                if (++count % 10000 == 0) {
                    System.out.println("Loaded references: " + count);
                }
                final String[] shards = splitColumns(line, 3);
                final String movieId = shards[0];
                final String tagId = shards[1];
                final String weight = shards[2];
                // Fail with the offending line instead of a bare NullPointerException
                // when the row points at a tag or movie that was never loaded.
                final Node tag = requireNode(tagCache, "tag", tagId, line);
                final Node movie = requireNode(movie2id, "movie", movieId, line);
                final Relationship rel = tag.createRelationshipTo(movie, RelTypes.TAGS);
                rel.setProperty("weight", Integer.valueOf(weight));
            }
        } finally {
            in.close();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment