Delivering deep insight on advertising metrics and providing customers easy data access becomes a challenge as scale increases. In this session, Neustar, a global provider of real-time analytics, shows how they use Redshift to help advertisers and agencies reach the highest-performing customers using data science at scale. Neustar dives into the queries they use to determine how best to target ads based on their real reach, how much to pay for ads using multi-touch attribution, and how frequently to show ads. Finally, Neustar discusses how they operate a fleet of Redshift clusters to run workloads in parallel and generate daily reports on billions of events within hours. Session includes how Neustar provides daily feeds of event-level data to their customers for ad-hoc data science.
November 13, 2014 | Las Vegas, NV
Timon Karnezos, Director Infrastructure, Neustar
Vidhya Srinivasan, Sr. Manager Software Development, Amazon Redshift
Petabyte scale
Massively parallel
Relational data warehouse
Fully managed; zero admin
10 GigE (HPC)
Ingestion / Backup / Restore
JDBC/ODBC
Ad Tech
Use Cases
692.8s
34.9s
< 0.76%
[Figure: space-filling curve for two dimensions — a Z-shaped path through a 4x4 grid of interleaved 2-bit coordinates (00, 01, 10, 11)]
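The space-filling curve on the slide packs two dimensions into a single sort order by interleaving coordinate bits (Z-order / Morton encoding). A minimal illustrative sketch, not Neustar's code:

```python
def morton_interleave(x: int, y: int, bits: int = 2) -> int:
    """Map (x, y) to a Z-order (Morton) index by interleaving their bits."""
    z = 0
    for i in range(bits):
        z |= ((y >> i) & 1) << (2 * i)      # y bit -> even position
        z |= ((x >> i) & 1) << (2 * i + 1)  # x bit -> odd position
    return z

# Visiting the 4x4 grid of 2-bit coordinates in index order traces the
# Z-shaped space-filling curve from the slide: nearby cells get nearby indices.
order = sorted((morton_interleave(x, y), x, y)
               for x in range(4) for y in range(4))
```

Nearby (x, y) cells land close together in the one-dimensional index, which is why this kind of encoding is useful when a single sortkey has to serve multi-dimensional range queries.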
Frequency
Attribution
Overlap
Ad-hoc
0.7B / day
2B / week
8B / month
21B / quarter
-- Number of ads seen per user
WITH frequency_intermediate AS (
    SELECT user_id,
           SUM(1)       AS impression_count,
           SUM(cost)    AS cost,
           SUM(revenue) AS revenue
    FROM impressions
    WHERE record_date BETWEEN <...>
    GROUP BY 1
)
-- Number of people who saw N ads
SELECT impression_count, SUM(1), SUM(cost), SUM(revenue)
FROM frequency_intermediate
GROUP BY 1;
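The two-pass aggregation above (per-user counts, then a histogram over those counts) can be exercised locally with Python's sqlite3. Table and column names follow the slide; the rows are invented, and the ordinal GROUP BY is spelled out as column names for portability:

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.executescript("""
    CREATE TABLE impressions (user_id INTEGER, cost REAL, revenue REAL);
    INSERT INTO impressions VALUES
        (1, 0.01, 0.0), (1, 0.01, 0.5),   -- user 1 saw 2 ads
        (2, 0.02, 0.0),                   -- user 2 saw 1 ad
        (3, 0.01, 0.0), (3, 0.03, 0.0);   -- user 3 saw 2 ads
""")

# Same shape as the slide's query: roll up per user, then histogram the counts.
rows = con.execute("""
    WITH frequency_intermediate AS (
        SELECT user_id,
               SUM(1)       AS impression_count,
               SUM(cost)    AS cost,
               SUM(revenue) AS revenue
        FROM impressions
        GROUP BY user_id
    )
    SELECT impression_count, SUM(1) AS users, SUM(cost), SUM(revenue)
    FROM frequency_intermediate
    GROUP BY impression_count
    ORDER BY impression_count
""").fetchall()
# each row: (impression_count, users, total_cost, total_revenue)
```

With this toy data, one user saw a single ad and two users saw two ads, so the histogram has rows for impression counts 1 and 2.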
CREATE TABLE frequency_intermediate (
    record_date      date   ENCODE <...> NOT NULL,
    campaign_id      bigint ENCODE <...> NOT NULL,
    site_id          bigint ENCODE <...> NOT NULL,
    user_id          bigint ENCODE <...> NOT NULL DISTKEY,
    impression_count int    ENCODE <...> NOT NULL,
    cost             bigint ENCODE <...> NOT NULL,
    revenue          bigint ENCODE <...> NOT NULL
) SORTKEY(<...>, <...>, <...>, <...>);
WITH user_frequency AS (
    SELECT user_id, campaign_id, site_id,
           SUM(impression_count) AS frequency,
           SUM(cost)    AS cost,
           SUM(revenue) AS revenue
    FROM frequency_intermediate
    WHERE record_date BETWEEN <...>
    GROUP BY 1,2,3
)
SELECT campaign_id, site_id, frequency,
       SUM(1), SUM(cost), SUM(revenue)
FROM user_frequency
GROUP BY 1,2,3;
-- Basic sessionization query, assemble user activity
-- that ended in a conversion into a timeline.
SELECT <...>
FROM impressions i
JOIN conversions c ON
i.user_id = c.user_id AND
i.record_date < c.record_date
ORDER BY i.record_date;
[Figure: example session timelines — impressions before a conversion labeled with funnel position (1, 2, 3) and hour offset from the conversion (3, 12, 16 hours)]
-- Sessionize user activity per conversion, partition by campaign (45-day lookback window)
SELECT c.record_date AS conversion_date ,
c.event_id AS conversion_id ,
i.campaign_id AS campaign_id ,
i.site_id AS site_id ,
i.user_id AS user_id ,
c.revenue AS conversion_revenue,
DATEDIFF('hour', i.record_date, c.record_date) AS hour_offset,
SUM(1) OVER (PARTITION BY i.user_id, i.campaign_id, c.event_id
ORDER BY i.record_date DESC ROWS UNBOUNDED PRECEDING) AS position
FROM impressions i
JOIN conversions c ON
i.user_id = c.user_id AND
i.campaign_id = c.campaign_id AND
i.record_date < c.record_date AND
i.record_date > (c.record_date - interval '45 days') AND
c.record_date BETWEEN <...>;
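The windowed sessionization above (position as a running count over impressions ordered newest-first) can be reproduced with SQLite 3.25+ window functions via Python's sqlite3. Names follow the slide, data is invented; `DATEDIFF` is Redshift-specific and omitted here:

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.executescript("""
    CREATE TABLE impressions (user_id INT, campaign_id INT, site_id INT,
                              record_date TEXT);
    CREATE TABLE conversions (user_id INT, campaign_id INT, event_id INT,
                              record_date TEXT, revenue REAL);
    INSERT INTO impressions VALUES
        (1, 10, 100, '2014-11-01 08:00'),
        (1, 10, 101, '2014-11-01 12:00'),
        (1, 10, 100, '2014-11-01 20:00');
    INSERT INTO conversions VALUES (1, 10, 555, '2014-11-01 23:00', 9.99);
""")

# Position 1 is the impression closest to the conversion (the last touch),
# mirroring the slide's SUM(1) OVER (... ORDER BY record_date DESC).
rows = con.execute("""
    SELECT c.event_id,
           i.site_id,
           SUM(1) OVER (PARTITION BY i.user_id, i.campaign_id, c.event_id
                        ORDER BY i.record_date DESC
                        ROWS UNBOUNDED PRECEDING) AS position
    FROM impressions i
    JOIN conversions c
      ON i.user_id = c.user_id
     AND i.campaign_id = c.campaign_id
     AND i.record_date < c.record_date
    ORDER BY position
""").fetchall()
# rows -> [(555, 100, 1), (555, 101, 2), (555, 100, 3)]
```

The 20:00 impression gets position 1 and the 08:00 impression position 3, i.e. positions count backwards from the conversion.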
-- Compute statistics on sessions (funnel placement, last-touch, site-count, etc...)
-- The window aggregate is computed in a CTE first, since window functions
-- cannot be nested inside GROUP BY aggregates.
WITH session_site_counts AS (
    SELECT campaign_id,
           site_id,
           conversion_date,
           position,
           conversion_revenue,
           COUNT(DISTINCT site_id)
               OVER (PARTITION BY user_id, campaign_id, conversion_id
                     ORDER BY position DESC
                     ROWS UNBOUNDED PRECEDING) AS preceding_site_count
    FROM sessions
)
SELECT campaign_id,
       site_id,
       conversion_date,
       AVG(position) AS average_position,
       SUM(CASE WHEN position = 1 THEN conversion_revenue ELSE 0 END) AS lta_attributed,
       AVG(preceding_site_count) AS average_unique_preceding_site_count
FROM session_site_counts
GROUP BY 1,2,3;
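Last-touch attribution, as used above, credits all conversion revenue to the position-1 impression. A sketch over a toy sessions table (names follow the slide; the rows are invented):

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.executescript("""
    CREATE TABLE sessions (campaign_id INT, site_id INT, conversion_date TEXT,
                           position INT, conversion_revenue REAL);
    INSERT INTO sessions VALUES
        (10, 100, '2014-11-01', 1, 9.99),   -- last touch: site 100 gets credit
        (10, 101, '2014-11-01', 2, 9.99),
        (10, 100, '2014-11-01', 3, 9.99);
""")

rows = con.execute("""
    SELECT campaign_id, site_id,
           AVG(position) AS average_position,
           SUM(CASE WHEN position = 1 THEN conversion_revenue ELSE 0 END)
               AS lta_attributed
    FROM sessions
    GROUP BY campaign_id, site_id
    ORDER BY site_id
""").fetchall()
# Site 100 appears at positions 1 and 3; only position 1 earns attribution.
```

Site 101, which only ever appears mid-funnel, attributes zero revenue under last-touch even though it touched the same converting user.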
        Site A  Site B  Site C
Site A          20%     60%
Site B                  90%
Site C
CPM     $0.06   $1.05   $9.50
CREATE TABLE overlap_intermediate (
    user_id bigint ENCODE <...> NOT NULL DISTKEY,
    site_id bigint ENCODE <...> NOT NULL
) SORTKEY(<...>);
WITH co_occurrences AS (
    SELECT oi.site_id  AS site1,
           oi2.site_id AS site2
    FROM overlap_intermediate oi
    JOIN overlap_intermediate oi2 ON
        oi.site_id > oi2.site_id AND
        oi.user_id = oi2.user_id
)
SELECT site1, site2, SUM(1)
FROM co_occurrences
GROUP BY 1,2;
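The triangular self-join (site1 > site2) counts each pair of sites once per user who appears on both. It can be checked with Python's sqlite3 on invented data:

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.executescript("""
    CREATE TABLE overlap_intermediate (user_id INT, site_id INT);
    INSERT INTO overlap_intermediate VALUES
        (1, 100), (1, 101),            -- user 1 seen on sites 100 and 101
        (2, 100), (2, 101), (2, 102),  -- user 2 seen on all three sites
        (3, 102);                      -- user 3 seen on one site only
""")

# site_id > site_id dedupes pairs: (101, 100) is counted, (100, 101) is not.
rows = con.execute("""
    SELECT oi.site_id AS site1, oi2.site_id AS site2, SUM(1) AS users
    FROM overlap_intermediate oi
    JOIN overlap_intermediate oi2
      ON oi.site_id > oi2.site_id
     AND oi.user_id = oi2.user_id
    GROUP BY site1, site2
    ORDER BY site1, site2
""").fetchall()
# rows -> [(101, 100, 2), (102, 100, 1), (102, 101, 1)]
```

Users 1 and 2 overlap on sites 100/101, while only user 2 links site 102 to the other two; user 3 contributes no pairs.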
CREATE TABLE overlap_intermediate (
    record_date date   ENCODE <...> NOT NULL,
    campaign_id bigint ENCODE <...> NOT NULL,
    site_id     bigint ENCODE <...> NOT NULL,
    user_id     bigint ENCODE <...> NOT NULL DISTKEY
) SORTKEY(<...>, <...>);
WITH
site_overlap_intermediate AS (
    SELECT user_id, site_id, campaign_id
    FROM overlap_intermediate
    WHERE record_date BETWEEN <...>
    GROUP BY 1,2,3
),
site_co_occurrences AS (
    SELECT oi.campaign_id AS c_id, oi.site_id AS site1, oi2.site_id AS site2
    FROM site_overlap_intermediate oi
    JOIN site_overlap_intermediate oi2 ON
        oi.site_id > oi2.site_id AND
        oi.user_id = oi2.user_id AND
        oi.campaign_id = oi2.campaign_id
)
SELECT c_id, site1, site2, SUM(1)
FROM site_co_occurrences
GROUP BY 1,2,3;
8 fact tables
26 dimension tables
7 mapping tables
42 views
121 joins
1100 sloc
$ pg_dump -Fc -f some_file --table=foo --table=bar <dbname>
$ pg_restore --schema-only --clean -Fc some_file > schema.sql
$ pg_restore --data-only --table=foo -Fc some_file > foo.tsv
$ aws s3 cp schema.sql s3://metadata-bucket/YYYYMMDD/schema.sql
$ aws s3 cp foo.tsv s3://metadata-bucket/YYYYMMDD/foo.tsv
> \i schema.sql
> COPY foo FROM 's3://metadata-bucket/YYYYMMDD/foo.tsv' <...>
# or combine 'COPY <...> FROM <...> SSH' and pg_restore/psql
UNLOAD
('
  SELECT i.*
  FROM impressions i
  JOIN client_to_campaign_mapping m ON
      m.campaign_id = i.campaign_id
  WHERE i.record_date >= \'{{yyyy}}-{{mm}}-{{dd}}\' - interval \'1 day\' AND
        i.record_date <  \'{{yyyy}}-{{mm}}-{{dd}}\' AND
        m.client_id = <...>
')
TO 's3://{{bucket}}/us_eastern/{{yyyy}}/{{mm}}/{{dd}}/dsdk_events/{{vers}}/impressions/'
WITH CREDENTIALS 'aws_access_key_id={{key}};aws_secret_access_key={{secret}}'
DELIMITER ',' NULL '\\N' ADDQUOTES ESCAPE GZIP MANIFEST;
Workload                                     Node Count  Node Type    Restore  Maint.  Exec.
Frequency & Attribution & Overlap & Ad-hoc       16      dw2.8xlarge    2h       1h     6h
= $691.20

Workload     Node Count  Node Type    Restore  Maint.  Exec.
Frequency        8       dw2.8xlarge   1.5h    0.5h    2.5h
Attribution      8       dw2.8xlarge   1.5h    0.5h    2h
Overlap          8       dw2.8xlarge   1h      0.5h    2.5h
Ad-hoc           8       dw2.8xlarge   0h      0.5h    1.5h
= $556.80 (-19%)
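The dollar figures are consistent with node-hours times a single hourly rate of about $4.80 per dw2.8xlarge node; the rate is inferred from the totals rather than stated on the slide. A quick arithmetic check:

```python
rate = 4.80  # $/node-hour, implied by $691.20 / (16 nodes * 9 h)

# One 16-node cluster running all workloads: 2h restore + 1h maint + 6h exec.
single = 16 * (2 + 1 + 6) * rate

# Four 8-node clusters, one per workload, each restore + maint + exec hours.
split = sum(8 * hours * rate for hours in
            [1.5 + 0.5 + 2.5,   # Frequency
             1.5 + 0.5 + 2.0,   # Attribution
             1.0 + 0.5 + 2.5,   # Overlap
             0.0 + 0.5 + 1.5])  # Ad-hoc

print(round(single, 2), round(split, 2), round(1 - split / single, 3))
# -> 691.2 556.8 0.194  (about 19% cheaper)
```

Splitting the fleet wins because the ad-hoc cluster skips the restore entirely and the per-workload clusters spend fewer total node-hours than one shared cluster serializing everything.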
http://bit.ly/awsevals