Public | Automated Build

Last pushed: a year ago
Short Description
SparkWorkshop
Full Description

SparkWorkshop

# Start the workshop image in an interactive, throwaway container named
# "sparkworkshop" (--rm removes the container when it exits).
# Container ports 8888 and 4040 are published on the same host ports
# (presumably the notebook server and the Spark web UI -- confirm against the image).
docker run             \
  --rm                 \
  --name=sparkworkshop \
  -p 8888:8888         \
  -p 4040:4040         \
  -it dserban/sparkworkshop
Docker Pull Command
Owner
dserban
Source Repository

Comments (1)
lucianmol
a year ago

Thank you once again, Dan, for the workshop!
Hope to meet you again soon.

Below is my solution to the exercise:

#1. get mapping tuples (country_id, continent_id)
# Each raw line is split on commas; its first two fields are converted to int.
ctr_cont_map_rdd = (
    raw_countries_rdd
    .map(lambda line: line.split(','))
    .map(lambda fields: (int(fields[0]), int(fields[1])))
)

ctr_cont_map_rdd.take(10)

#2. get tuples (country_id, (city_initial, list_of_cities))
# NOTE(review): cities_rdd is assumed to hold
# ((country_id, city_initial), list_of_cities) pairs and `add` is assumed to be
# operator.add (list concatenation here); neither is defined in this snippet.
def _rekey_by_country(pair):
    """Move city_initial out of the key so the record is keyed by country_id only.

    Tuple-unpacking lambda parameters were removed in Python 3 (PEP 3113),
    so the unpacking happens in the function body; this runs on Python 2 and 3.
    """
    (ctr_id, first_ltr), cit_list = pair
    return ctr_id, (first_ltr, cit_list)


ctr_cit_list_rdd = (
    cities_rdd
    .reduceByKey(add)
    .map(_rekey_by_country)
)

ctr_cit_list_rdd.take(10)

#3. get tuples (continent_id, list_of_cities) by city_initial
# Tuple-unpacking lambda parameters were removed in Python 3 (PEP 3113), so the
# nested records are unpacked inside small helpers that run on Python 2 and 3.
def _key_by_continent_and_initial(joined):
    """Turn a joined (country_id, ((initial, cities), continent_id)) record
    into ((continent_id, initial), cities); the country_id is discarded."""
    _ctr_id, ((first_ltr, cit_list), cont_id) = joined
    return (cont_id, first_ltr), cit_list


def _drop_initial(pair):
    """Reduce ((continent_id, initial), cities) to (continent_id, cities)."""
    (cont_id, _first_ltr), cit_list = pair
    return cont_id, cit_list


cont_cit_list_rdd = (
    ctr_cit_list_rdd
    .join(ctr_cont_map_rdd)
    .map(_key_by_continent_and_initial)
    # merge the per-country lists that share a continent and an initial
    .reduceByKey(add)
    # the initial is dropped here, so later steps only see continent_id + cities
    .map(_drop_initial)
)

cont_cit_list_rdd.take(10)

#4. build the final report, in this order:
#   - replace continent_id with continent_name
#   - sort in descending number of cities with similar initial (first letter in name)
#   - add city count
#   - keep only groups where the number of cities is > 2
# Result tuples are (continent_name, city_count, list_of_cities); the city
# initial is not part of the output because cont_cit_list_rdd no longer carries it.
# NOTE(review): continents_rdd is assumed to hold (continent_id, continent_name)
# pairs; it is not defined in this snippet.
# Tuple-unpacking lambda parameters were removed in Python 3 (PEP 3113), so the
# records are unpacked inside small helpers that run on Python 2 and 3.
def _name_continent(joined):
    """Turn (continent_id, (cities, continent_name)) into (continent_name, cities)."""
    _cont_id, (cit_list, cont_name) = joined
    return cont_name, cit_list


def _city_count(named):
    """Return the number of cities in a (continent_name, cities) record."""
    _cont_name, cit_list = named
    return len(cit_list)


def _with_count(named):
    """Turn (continent_name, cities) into (continent_name, city_count, cities)."""
    cont_name, cit_list = named
    return cont_name, len(cit_list), cit_list


def _has_more_than_two_cities(row):
    """Tell whether a (continent_name, city_count, cities) row has over 2 cities."""
    _cont_name, num_cit, _cit_list = row
    return num_cit > 2


nice_cont_cit_list = (
    cont_cit_list_rdd
    .join(continents_rdd)
    .map(_name_continent)
    .sortBy(_city_count, ascending=False)
    .map(_with_count)
    .filter(_has_more_than_two_cities)
)

nice_cont_cit_list.collect()