Skip to content

Instantly share code, notes, and snippets.

View sscdotopen's full-sized avatar

Sebastian Schelter sscdotopen

View GitHub Profile
@sscdotopen
sscdotopen / order-dd.rs
Created March 7, 2024 12:14
Differential Dataflow implementation of the incremental view maintenance example from our lecture
extern crate timely;
extern crate differential_dataflow;
use timely::dataflow::operators::probe::Handle;
use differential_dataflow::input::Input;
use differential_dataflow::operators::*;
fn main() {
@sscdotopen
sscdotopen / session_sim.rs
Created February 10, 2021 16:13
Session similarity
// 'most_recent_neighbors' is a set of ~500 similar sessions
for neighbor_session in most_recent_neighbors.into_iter() {
let mut similarity = 0_f64;
// This returns a HashSet of the items contained in the session
let other_session_items = index.items_for_session(&neighbor_session.id);
// The 'evolving_session' is an array of items of length 'num_items_in_evolving_session'
for (pos, item_id) in evolving_session.iter().enumerate() {
if other_session_items.contains(&item_id) {
let weight = (pos + 1) as f64 / num_items_in_evolving_session as f64;
similarity += decay_factor;
@sscdotopen
sscdotopen / nested.scala
Created June 25, 2019 16:03
Serializing nested data in Spark
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.sql.types._
val attributesType = new MapType(StringType, StringType, valueContainsNull = false)
val historyEntryType = new StructType()
.add("intervalStart", LongType)
.add("intervalEnd", LongType)
.add("type", StringType)
.add("attributes", attributesType)

Recommender Example

Input table

Table capturing ratings that users gave to items, with schema: user_id, item_id, rating, date

Preprocessing

  • group data by user_id
  • remove groups with fewer than 10 items

Keybase proof

I hereby claim:

  • I am sscdotopen on github.
  • I am sscdotopen (https://keybase.io/sscdotopen) on keybase.
  • I have a public key ASBY5f_Vi_i94AxL83aHAH3zGaR_Uq3Ww4yNFXPKDnWeYwo

To claim this, I am signing this object:

package org.apache.spark.examples
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import java.util.Random
import scala.collection.mutable
import org.apache.spark.serializer.KryoRegistrator
import com.esotericsoftware.kryo.Kryo
package eu.stratosphere.scala.examples.wordcount
import eu.stratosphere.scala.{ScalaPlan, TextFile}
import eu.stratosphere.pact.common.plan.PlanAssembler
import eu.stratosphere.scala._
import eu.stratosphere.scala.operators._
case class Author(id: Int, name: String)
@sscdotopen
sscdotopen / gist:5416101
Created April 18, 2013 20:48
ALS Testing patch
Index: core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizer.java
===================================================================
--- core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizer.java (Revision 1469532)
+++ core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizer.java (Arbeitskopie)
@@ -173,6 +173,7 @@
}
for (int iteration = 0; iteration < numIterations; iteration++) {
+ long start = System.currentTimeMillis();
log.info("iteration {}", iteration);
\\
\text{transition matrix}: A \\
\text{rank vector}: x \\
\\
\text{PageRank}: x^{t+1} = A x^t \\
\\
\forall t > 2: \\
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.function.IntObjectProcedure;
import org.apache.mahout.math.map.OpenIntObjectHashMap;
import java.util.Random;
public class Benchmark {
static int NUM_FEATURES = 20;