shantanuo/etl
The data used here comes from the KKBox churn prediction challenge:

https://www.kaggle.com/c/kkbox-churn-prediction-challenge/data
The shantanuo/etl image reads a CSV file, converts it to Parquet, and uploads the result to the given S3 bucket. For example, to process transactions.csv:

docker run -v /tmp/predict-customer-churn/data/:/tmp/ \
    -e AWS_ACCESS_KEY_ID='xxx' \
    -e AWS_SECRET_ACCESS_KEY='xxx' \
    -e s3_path='s3://todel164' \
    -e csv_path='/tmp/transactions.csv' \
    shantanuo/etl python /home/process.py
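The /home/process.py script baked into the image is not shown here. As a rough idea of the kind of work it does, a minimal sketch of a CSV-to-Parquet export driven by the same environment variables might look like the following; the chunk size, output naming, and pandas/pyarrow/s3fs dependencies are assumptions, not the image's actual implementation.

# Hypothetical sketch only; not the actual process.py shipped in the image.
import os
import pandas as pd  # assumes pandas with pyarrow and s3fs available

csv_path = os.environ["csv_path"]   # e.g. /tmp/transactions.csv
s3_path = os.environ["s3_path"]     # e.g. s3://todel164
name = os.path.splitext(os.path.basename(csv_path))[0]

# Read the CSV in chunks to keep memory bounded and write each chunk to S3
# as a Parquet file; credentials come from the AWS_* environment variables.
for i, chunk in enumerate(pd.read_csv(csv_path, chunksize=1_000_000)):
    chunk.to_parquet(f"{s3_path}/{name}_{i:04d}.parquet", index=False)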
Once the Parquet files are in S3, define an external table in Athena on top of the bucket:

CREATE EXTERNAL TABLE IF NOT EXISTS sampledb.todel14a (
  `msno` string,
  `is_churn` string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
WITH SERDEPROPERTIES (
  'serialization.format' = '1'
)
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION 's3://todel164/'
TBLPROPERTIES ('has_encrypted_data'='false');
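To sanity-check the table you can run a quick query, for example via boto3; the region and the results bucket below are placeholders for this sketch, not part of the original setup.

import boto3

# Region and output location are assumptions for this example.
athena = boto3.client("athena", region_name="us-east-1")

# Run a simple row count against the new table; Athena writes the
# result set to the output location given below.
resp = athena.start_query_execution(
    QueryString="SELECT count(*) FROM sampledb.todel14a",
    ResultConfiguration={"OutputLocation": "s3://my-athena-results-bucket/"},
)
print(resp["QueryExecutionId"])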
The user_logs.csv file is too large to convert in one pass, so split it into chunks of at most 1 GB each without breaking lines:

split -C 1G user_logs.csv

Only the first chunk (xaa) retains the original header row. The following shell script adds the header to every other part and exports the Parquet files to S3:
#!/bin/bash
for file in x*
do
    if [ "$file" = "xaa" ]
    then
        # The first chunk already contains the header row.
        docker run -v /tmp/predict-customer-churn/data/:/tmp/ \
            -e AWS_ACCESS_KEY_ID='xxx' \
            -e AWS_SECRET_ACCESS_KEY='xxx' \
            -e s3_path='s3://todel164' \
            -e csv_path="/tmp/xaa" \
            shantanuo/etl python /home/process.py
    else
        # Prepend the header row taken from the original file.
        head -1 user_logs.csv > "${file}_temp"
        cat "$file" >> "${file}_temp"
        mv "${file}_temp" "$file"
        docker run -v /tmp/predict-customer-churn/data/:/tmp/ \
            -e AWS_ACCESS_KEY_ID='xxx' \
            -e AWS_SECRET_ACCESS_KEY='xxx' \
            -e s3_path='s3://todel164' \
            -e csv_path="/tmp/$file" \
            shantanuo/etl python /home/process.py
    fi
done
The image itself can be pulled directly from Docker Hub:

docker pull shantanuo/etl