# Install Oracle Java 8 (via the WebUpd8 PPA)
$ sudo apt-get update
$ sudo apt-get install software-properties-common
$ sudo add-apt-repository ppa:webupd8team/java
$ sudo apt-get update
$ sudo apt-get install oracle-java8-installer
# Setup JAVA_HOME
$ sudo update-alternatives --config java
$ sudo nano /etc/environment
# append this line, then reload it with: source /etc/environment
JAVA_HOME="/usr/lib/jvm/java-8-oracle"
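# Add the Apache Cassandra apt repository ("36x" = the 3.6 release series)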
echo "deb http://www.apache.org/dist/cassandra/debian 36x main" | sudo tee -a /etc/apt/sources.list.d/cassandra.sources.list
echo "deb-src http://www.apache.org/dist/cassandra/debian 36x main" | sudo tee -a /etc/apt/sources.list.d/cassandra.sources.list
# Import the repository signing keys (avoids apt's untrusted-key warnings)
gpg --keyserver pgp.mit.edu --recv-keys F758CE318D77295D
gpg --export --armor F758CE318D77295D | sudo apt-key add -
gpg --keyserver pgp.mit.edu --recv-keys 2B5C1B00
gpg --export --armor 2B5C1B00 | sudo apt-key add -
gpg --keyserver pgp.mit.edu --recv-keys 0353B12C
gpg --export --armor 0353B12C | sudo apt-key add -
sudo apt-get update
# Install
sudo apt-get install cassandra
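# sanity check: the node should report UN (Up/Normal)
nodetool status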
# update listen_address and rpc_address, if necessary, in:
sudo nano /etc/cassandra/cassandra.yaml
cqlsh
CREATE KEYSPACE test WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };
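-- SimpleStrategy with replication_factor 1 is enough for a single-node dev box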
USE test;
CREATE TABLE kv (
    k varchar PRIMARY KEY,
    v varchar
);
INSERT INTO test.kv (k, v) VALUES ('hello', 'world');
SELECT * FROM kv;
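-- should print the single row just inserted: hello | world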
sudo apt-get install git
# Go to http://spark.apache.org/downloads.html
# and download a Spark release pre-built for Hadoop
wget http://mirror.ox.ac.uk/sites/rsync.apache.org/spark/spark-1.6.1/spark-1.6.1-bin-hadoop2.6.tgz
tar -xvzf spark-1.6.1-bin-hadoop2.6.tgz
mv spark-1.6.1-bin-hadoop2.6 spark
cd spark
./bin/run-example SparkPi 10
bin/spark-shell
val file = sc.textFile("README.md")
file.count()
cd conf
cp spark-env.sh.template spark-env.sh
nano spark-env.sh
PYSPARK_PYTHON=python3
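# spark-env.sh is sourced by the launch scripts, so pyspark will now use Python 3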
cd ..
bin/pyspark
f = sc.textFile('README.md')
f.count()
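# A slightly bigger smoke test in the same shell: the classic word count over
# the README that ships with Spark (a sketch; only `f` from above is assumed)
counts = (f.flatMap(lambda line: line.split())
           .map(lambda w: (w, 1))
           .reduceByKey(lambda a, b: a + b))
counts.take(5)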
cp conf/spark-defaults.conf.template conf/spark-defaults.conf
nano conf/spark-defaults.conf
spark.jars.packages datastax:spark-cassandra-connector:1.6.0-s_2.10
spark.cassandra.connection.host 127.0.0.1
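# spark.jars.packages tells spark-shell/pyspark to fetch the connector (plus its
# dependencies) from Spark Packages at startup; 1.6.0-s_2.10 matches Spark 1.6 on Scala 2.10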
bin/spark-shell
import com.datastax.spark.connector._
val rdd = sc.cassandraTable("test", "kv")
println(rdd.first)
sqlContext.read.
  format("org.apache.spark.sql.cassandra").
  options(Map("table" -> "kv", "keyspace" -> "test")).
  load().show()
# Find the host's IP address (e.g. 123.123.123.123) and map it to the hostname in /etc/hosts:
sudo nano /etc/hosts
123.123.123.123 ubuntu
sudo nano /etc/cassandra/cassandra.yaml
# replace the localhost defaults with the host IP (note: listen_address and
# rpc_address take an IP; the *_interface variants take a device name such as
# eth0, not an IP)
listen_address: 123.123.123.123
rpc_address: 123.123.123.123
# and under seed_provider -> parameters:
- seeds: "123.123.123.123"
sudo service cassandra stop
sudo service cassandra start
nano conf/spark-defaults.conf
spark.cassandra.connection.host 123.123.123.123
nano conf/spark-env.sh
SPARK_MASTER_IP=123.123.123.123
sbin/start-all.sh
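# start-all.sh launches a master plus the workers listed in conf/slaves (just
# localhost by default); the master web UI should come up at http://123.123.123.123:8080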
bin/pyspark --master spark://123.123.123.123:7077
df = sqlContext.read\
.format("org.apache.spark.sql.cassandra")\
.options(table="kv", keyspace="test")\
.load()
df.show()
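# The connector can also write through the same data source. A minimal sketch,
# assuming the test.kv table from above (the row values here are made up):
row = sqlContext.createDataFrame([("spark", "cassandra")], ["k", "v"])
row.write\
    .format("org.apache.spark.sql.cassandra")\
    .options(table="kv", keyspace="test")\
    .mode("append")\
    .save()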
sudo nano /etc/cassandra/cassandra.yaml
authenticator: PasswordAuthenticator
authorizer: CassandraAuthorizer
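# (the shipped defaults are AllowAllAuthenticator / AllowAllAuthorizer, i.e. no auth at all)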
sudo service cassandra stop
sudo service cassandra start
cqlsh 123.123.123.123 -u cassandra -p cassandra
CREATE USER jimmy WITH PASSWORD '????????' SUPERUSER;
ALTER USER cassandra WITH PASSWORD 'idontreallycarewhatpasswordthisisanymore';
bin/pyspark
sc.stop()
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
conf = SparkConf()\
.set("spark.cassandra.auth.username", "jimmy")\
.set("spark.cassandra.auth.password", "????????")\
.setAppName("SparkCassandra")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
df = sqlContext.read\
.format("org.apache.spark.sql.cassandra")\
.options(table="kv", keyspace="test")\
.load()
df.show()
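# Reads go through the usual DataFrame API from here; a quick filter as a sketch
# (the connector can push some predicates down to Cassandra where it can):
df.where(df.k == "hello").select("v").show()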