Running locally
scald.rb
sbt
Introduction
Ecosystem
Word count
import com.twitter.scalding._
class WordCountJob(args: Args) extends Job(args) {
  TextLine(args("input"))            // --input hdfs://... -> [line]
    .read                            // input -> pipe
    .flatMap('line -> 'word) {       // [line] -> [line, word]
      line: String => line.split("""\s+""")
    }
    .groupBy('word) { _.size }       // [word, size]
    .write(Tsv(args("output")))      // --output hdfs://...
}
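To iterate locally (cf. scald.rb / sbt above), a sketch using scalding's JobTest; file names and sample lines here are made up:
JobTest(new WordCountJob(_))
  .arg("input", "input.txt")
  .arg("output", "output.tsv")
  .source(TextLine("input.txt"), List((0, "call me maybe"), (1, "call me")))
  .sink[(String, Long)](Tsv("output.tsv")) { out =>
    out.foreach(println)   // expect (call,2), (me,2), (maybe,1) in some order
  }
  .run
  .finish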
Sources and sinks
// TSV with 4 fields
val pipe = Tsv(args("input"), ('user, 'email, 'country, 'product)).read
pipe.write(Tsv(args("output"))) // fields inferred from input
pipe.write(Tsv(args("output"), ('user, 'email))) // keep only 2
pipe1 ++ pipe2 // union 2 pipes that have the same fields
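A few other built-in sources follow the same read/write pattern; a quick sketch (paths and field names are illustrative):
TextLine(args("raw")).read                                      // fields: ['offset, 'line]
  .project('line)
  .write(Tsv(args("lines")))
IterableSource(List(("a", 1), ("b", 2)), ('key, 'value)).read   // in-memory source, handy for tests
  .write(Csv(args("csv")))                                      // comma-separated sink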
Working with fields
val pipe = Tsv(args("input1"), ('user, 'track, 'artist, 'time)).read
pipe.project('user, 'track) // keep these 2
pipe.discard('artist, 'time) // throw away these 2
// insert 2 constants to every tuple
pipe.insert(('alpha, 'beta), (0.02, 0.01))
// useful before a join: give fields distinct names
pipe.rename(('user, 'track) -> ('userLHS, 'trackLHS))
Map and flatMap
val pipe = Tsv(args("input"), ('user, 'time, 'uri, 'msg)).read
// new field "userGroup" from "user"
pipe.map('user -> 'userGroup) { user: String => user.hashCode }
pipe
.project('uri, 'msg)
.flatMap('msg -> 'token) { msg: String => msg.split("""\s+""") }
/*
["spotify:track:J3P53n", "so call me maybe"] ->
["spotify:track:J3P53n", "so call me maybe", "so"]
["spotify:track:J3P53n", "so call me maybe", "call"]
["spotify:track:J3P53n", "so call me maybe", "me"]
["spotify:track:J3P53n", "so call me maybe", "maybe"] */
Use mapTo and flatMapTo when only new fields are needed
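A minimal sketch on the same pipe (field names are just for illustration):
pipe.flatMapTo('msg -> 'token) { msg: String => msg.split("""\s+""") } // output tuples contain only [token]
pipe.mapTo(('user, 'msg) -> 'len) { fields: (String, String) => fields._2.length } // only [len] survives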
Filter
val pipe = Tsv(args("input"), ('user, 'time, 'uri, 'msg)).read
// keep tracks
pipe.filter('uri) { uri: String => uri.startsWith("spotify:track:") }
// get rid of spam
pipe.filterNot(('user, 'msg)) { fields: (String, String) =>
val (user, msg) = fields
user == "spammer.inc" || msg == "call me maybe"
}
See the RichPipe ScalaDoc for more operations
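A few more RichPipe operations, sketched on the same pipe (illustrative only):
pipe.unique('user)   // distinct values of 'user
pipe.limit(100)      // keep at most 100 tuples
pipe.debug           // print each tuple while the job runs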
Group
val pipe = Tsv(args("input"), ('user, 'item, 'rating)).read
// groupBy(f: Fields)(builder: (GroupBuilder) => GroupBuilder): Pipe
pipe.groupBy('user) { // user -> [[item, rating], [item, rating], ...]
_.size // [[item, rating], ...] -> Int
}
Group
val pipe = Tsv(args("input"), ('user, 'item, 'rating)).read
pipe.groupBy('user) { // user -> [[item, rating], [item, rating], ...]
  _.reducers(1000)              // number of reducers, jbx will kill you!
    .size('n)                   // [user, n]
    .average('rating -> 'avgR)  // [user, n, avgR]
    .sum('rating -> 'sumR)      // [user, n, avgR, sumR]
    .max('rating -> 'maxR)      // [user, n, avgR, sumR, maxR]
    .min('rating -> 'minR)      // [user, n, avgR, sumR, maxR, minR]
}
// [user, n, avgR, stdevR]
pipe.groupBy('user) { _.sizeAveStdev('rating -> ('n, 'avgR, 'stdevR)) }
Group reduce
val pipe = Tsv(args("input"), ('user, 'uri, 'duration)).read
pipe.groupBy('uri) {
// reduce function is commutative
// done on both mapper and reducer side
_.reduce('duration -> 'totalDuration) { (d1: Int, d2: Int) => d1 + d2 }
}
pipe.groupBy('user) {
  // foldLeft(fields -> newFields)(init) { function }
  // done only on reducer side
  _.foldLeft('uri -> 'uniqueUris)(Set[String]()) {
    (uris: Set[String], uri: String) => uris + uri
  }
}
More group operations
def uri2set(uri: String): Set[String] = Set(uri)
def setUnion(s1: Set[String], s2: Set[String]): Set[String] = s1 ++ s2
def set2str(s: Set[String]): String = s.mkString(":")
val pipe = Tsv(args("input"), ('user, 'uri, 'duration)).read
pipe.groupBy('user) {
  // mapReduceMap(fields -> newFields)(mapFn1)(reduceFn)(mapFn2)
  // T, X, U -> original, intermediate, result type
  // mapFn1: (T) => X, mapper side
  // reduceFn: (X, X) => X, both mapper and reducer side
  // mapFn2: (X) => U, reducer side
  _.mapReduceMap('uri -> 'uniqueUris)(uri2set)(setUnion)(set2str)
}
See the GroupBuilder ScalaDoc for more operations
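Two more GroupBuilder operations, sketched on the same ('user, 'uri, 'duration) pipe (illustrative only):
pipe.groupBy('user) { _.toList[String]('uri -> 'uris) }       // collect all values into a List on the reducer
pipe.groupBy('user) { _.sortBy('duration).reverse.take(10) }  // keep the 10 longest plays per user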
Joins
joinWithSmaller - preferred
joinWithLarger - reverse of joinWithSmaller
joinWithTiny - entirely mapper side
val ratings = Tsv(args("ratings"), ('user, 'item, 'rating)).read
val names = Tsv(args("names"), ('item, 'name)).read
ratings
.groupBy('item) { _.average('rating -> 'avgRating) } // [item, avgRating]
// (LHS fields -> RHS fields, RHS pipe)
.joinWithSmaller('item -> 'item, names) // [item, avgRating, name]
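A sketch of the map-side variant; rename keeps the join field names distinct (names are illustrative):
val namesRHS = names.rename('item -> 'itemRHS)
ratings
  .joinWithTiny('item -> 'itemRHS, namesRHS) // RHS held in memory on the mappers, no reduce step
  .discard('itemRHS)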
Functions on multiple fields
Tsv(args("input"), ('user, 'item, 'rating)).read
.map(('user, 'item) -> ('userGroup, 'itemType)) {
// (Tuple2[String, String]) => (Int, String)
fields: (String, String) =>
val (user, item) = fields
(user.hashCode, item.split(":")(1))
}
import com.twitter.scalding.FunctionImplicits._
Tsv(args("input"), ('user, 'item, 'rating)).read
.map(('user, 'item) -> ('userGroup, 'itemType)) {
// (Tuple2[String, String]) => (Int, String)
// implicitly converted to
// (String, String) => (Int, String)
(user: String, item: String) =>
(user.hashCode, item.split(":")(1))
}
Type safe API
TypedPipe[T]
Grouped[K, V]
Back-n-forth between Pipe and TypedPipe[T]
// unsafe in, safe out
// field-based input, 3 untyped fields
Tsv(args("input"), ('username, 'trackGid, 'count)).read
  // convert to TypedPipe of Tuple3
  .toTypedPipe[(String, String, Int)]('username, 'trackGid, 'count)
// safe in, unsafe out
PackedAvroSource[EndSongCleaned](args("input"))
  .map(e => (e.getUsername.toString, e.getTrackid.toString, e.getMsPlayed))
  .toPipe('username, 'trackId, 'msPlayed)
TypedPipe[T]
map, flatMap & filter work just like on a standard collection
groupBy(fn: T => K) → Grouped[K, T]
  TypedPipe[EndSongCleaned].groupBy(_.getUsername.toString) → Grouped[String, EndSongCleaned]
group when T == (K, V) → Grouped[K, V]
  TypedPipe[(String, Int)].group → Grouped[String, Int]
groupAll → Grouped[Unit, T]
  TypedPipe[T].groupAll → Grouped[Unit, T]
Grouped[K, V]
join with a Grouped[K, W] → CoGrouped[K, (V, W)]
withReducers(reds: Int) sets the number of reducers
toTypedPipe → TypedPipe[(K, V)] when done
Grouped[K, V] value reduction operations
reduce(fn: (V, V) => V) - values n-to-1
foldLeft(fn: (B, V) => B) - values n-to-1
max, min, size, sum, product, etc. - also n-to-1
count(fn: V => Boolean), forall(fn: V => Boolean)
Grouped[K, V] value n-to-m operations
mapValues(fn: V => U) - map values, n-to-n
mapValueStream(fn: Iterator[V] => Iterator[U])
mapGroup(fn: (K, Iterator[V]) => Iterator[U])
take, takeWhile, drop, dropWhile, and sort*
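To make these concrete, a minimal sketch inside a Job (the input path, field layout and reducer count are assumptions, not part of the original example):
val plays: TypedPipe[(String, Long)] =
  TypedPipe.from(TypedTsv[(String, Long)](args("plays"))) // assumed (username, msPlayed) pairs
plays
  .group               // Grouped[String, Long]
  .withReducers(100)   // tune parallelism
  .reduce(_ + _)       // total ms played per user, values n-to-1
  .toTypedPipe         // back to TypedPipe[(String, Long)] when done
  .write(TypedTsv[(String, Long)](args("totals")))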
Type safe word count
package com.spotify.scalding.tutorial
import com.twitter.scalding._
import TDsl._
class Tutorial1(args: Args) extends Job(args) {
  TypedPipe.from(TypedTsv[String](args("input")))  // TypedPipe[String]
    .filter(_ != null)                             // TypedPipe[String], fewer
    .flatMap(_.split("""\s+"""))                   // TypedPipe[String], more
    .groupBy(identity)                             // Grouped[String, String]
    .size                                          // UnsortedGrouped[String, Long]
    .toTypedPipe                                   // TypedPipe[(String, Long)]
    .write(TypedTsv[(String, Long)](args("output")))
}
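An equivalent sketch using the pair form from the group slide (assumes the same input/output args); sumByKey groups by word and sums the counts:
TypedPipe.from(TextLine(args("input")))   // TypedPipe[String], one element per line
  .flatMap(_.split("""\s+"""))
  .map(word => (word, 1L))                // TypedPipe[(String, Long)]
  .sumByKey                               // group by word, sum counts per word
  .toTypedPipe
  .write(TypedTsv[(String, Long)](args("output")))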
Introduction
Word count
import org.apache.spark._
import org.apache.spark.SparkContext._
object WordCount {
  def main(args: Array[String]) {
    // args(0) is master, local or yarn-standalone
    val sc = new SparkContext(args(0), "Tutorial0")  // one context per job
    sc.textFile(args(1))                             // local/HDFS input, RDD[String]
      .flatMap { line => line.split("""\s+""") }     // RDD[String]
      .map(word => (word, 1))                        // RDD[(String, Int)]
      .reduceByKey(_ + _)                            // RDD[(String, Int)], fewer items
      .saveAsTextFile(args(2))                       // local/HDFS output
  }
}
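A side note on execution: transformations are lazy and only an action triggers the job. A small sketch reusing sc and args from above:
val words  = sc.textFile(args(1)).flatMap(_.split("""\s+""")) // transformation: nothing runs yet
val counts = words.map(w => (w, 1)).reduceByKey(_ + _)        // still lazy, just builds the lineage
val n      = counts.count()                                   // action: schedules tasks, returns a local Long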
RDD API
RDD - Resilient Distributed Dataset
RDD[Double] - DoubleRDDFunctions
RDD[(K, V)] - PairRDDFunctions
Workflow
main runs sequentially on the master (driver)
transformations (RDD → RDD): run in parallel on executors
actions (RDD → local value): results are sent from executors to the driver
element-wise transformations: map, flatMap, filter, ...
key-based transformations: reduceByKey, groupByKey, ...
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.recommendation.{Rating, ALS}
object ImplicitALS {
  def main(args: Array[String]) {
    val sc = new SparkContext(args(0), "ImplicitALS")
    val ratings = sc.textFile(args(1)).map { l: String =>
      val t = l.split('\t')
      Rating(t(0).toInt, t(1).toInt, t(2).toFloat)
    }
    // rank 40, 20 iterations, lambda 0.8, alpha 0.2
    ALS.trainImplicit(ratings, 40, 20, 0.8, 0.2)
      .productFeatures
      .map { case (id, vec) => id + "\t" + vec.mkString(" ") }
      .saveAsTextFile(args(2))
  }
}
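If the trained model itself is needed, a sketch (the user/item ids below are placeholders):
val model = ALS.trainImplicit(ratings, 40, 20, 0.8, 0.2)
val score = model.predict(42, 1337) // predicted preference of user 42 for item 1337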
Performance tuning
Test data
projects/data/user-artist-1k
projects/data/id-to-name
Options
id-to-name
Spark
Further reading