$ brew install apache-spark
A Python shell with a preconfigured SparkContext (available as `sc`). It is
#!/bin/bash
# Compose the full HDP-flavored Spark version strings.
# Each component may be overridden via the environment; the values below
# are the defaults (HDP 2.2.0.0-2041 / Spark 1.2.0 / Hadoop 2.6.0).
: "${HDP_VERSION:=2.2.0.0-2041}"
: "${SPARK_VERSION:=1.2.0}"
: "${SPARK_DIST_PREFIX_VERSION:=2.2.0.0-82}"
: "${SPARK_HADOOP_VERSION:=2.6.0}"

# Version string of the Spark distribution tarball directory.
SPARK_DIST_VERSION="${SPARK_VERSION}.${SPARK_DIST_PREFIX_VERSION}-bin-${SPARK_HADOOP_VERSION}.${HDP_VERSION}"
# Version string of the Spark assembly jar.
SPARK_ASSEMBLY_VERSION="${SPARK_VERSION}.${SPARK_DIST_PREFIX_VERSION}-hadoop${SPARK_HADOOP_VERSION}.${HDP_VERSION}"
(by @andrestaltz)
If you prefer to watch video tutorials with live-coding, then check out this series I recorded with the same contents as in this article: Egghead.io - Introduction to Reactive Programming.
hadoop job -list | grep job_ | awk 'BEGIN{FS="\t";OFS=","};{print $1,strftime("%H:%M:%S", (systime()-int($3/1000)),1),"\""$4"\"","\""$6"\""}' |
-- Register the UDF jar and expose its functions for this Hive session.
ADD JAR s3://<s3-bucket>/jars/hive_contrib-0.5.jar;
CREATE TEMPORARY FUNCTION now as 'com.mt.utils.udf.Now';
CREATE TEMPORARY FUNCTION user_agent_f as 'com.mt.utils.UserAgent';
-- Merge the many small map / map-reduce output files into files of
-- roughly 500 MB each, to avoid the HDFS small-files problem.
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.merge.size.per.task=500000000;
CREATE EXTERNAL TABLE data |
#!/bin/sh
# Self-executing Scala script: the shell re-invokes this file through the
# `scala` runner, forwarding all arguments; `!#` ends the header block.
exec scala "$0" "$@"
!#
import scala.collection.JavaConversions._
import java.lang.management.{ManagementFactory, MemoryMXBean}
import java.net.URI
import javax.management.JMX
import javax.management.remote.{JMXConnectorFactory, JMXServiceURL}
sed 's/'`echo -e "\01"`'/,/g' input_file.txt > output_file.csv |