숙제
Table | Fields |
session_timestamp | sessionId (string), ts (timestamp) |
user_session_channel | userid (integer), sessionid (string), channel (string) |
session_transaction | sessionid (string), refunded (boolean), amount (integer) |
channel | channelname (string) |
사용자 별로 처음 채널과 마지막 채널 알아내기
- ROW_NUMBER vs FIRST_VALUE / LAST_VALUE
아래의 코드는 userid가 251인 사용자의 channel을 확인할 수 있다.
SELECT ts, channel
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st ON usc.sessionid = st.sessionid
WHERE userid = 251
ORDER BY 1;
문제 해결 (ROW_NUMBER)
CTE와 ROW_NUMBER를 사용하여 문제를 해결하였다. 혼자의 생각으로는 어려움이 있어 ChatGPT를 활용하여 해결하였다..ㅠ 올바르게 작동하지만, SELECT문 내에 두 개의 SELECT문이 포함돼 있어 작동 시간이 오래 걸린다. 이 방법은 강의에서 알아보려고 한다..
WITH numbered_channels AS (
SELECT
A.userId,
A.channel,
ROW_NUMBER() OVER (PARTITION BY A.userId ORDER BY B.ts) AS row_num_asc,
ROW_NUMBER() OVER (PARTITION BY A.userId ORDER BY B.ts DESC) AS row_num_desc
FROM raw_data.user_session_channel A
JOIN raw_data.session_timestamp B ON A.sessionId = B.sessionId
)
SELECT DISTINCT
userId,
(SELECT channel FROM numbered_channels WHERE userId = nc.userId AND row_num_asc = 1) AS first_channel,
(SELECT channel FROM numbered_channels WHERE userId = nc.userId AND row_num_desc = 1) AS last_channel
FROM numbered_channels nc
ORDER BY userId
LIMIT 10;
문제 해결 (FIRST_VALUE / LAST_VALUE)
FIRST_VALUE와 LAST_VALUE를 사용하여 문제를 해결하였다. "ROWS ~"부분은 윈도우 프레임을 정의한다. 서로 섞어 사용해도 결과는 동일하게 나온다.
SELECT DISTINCT
A.userId,
FIRST_VALUE(A.channel) OVER (partition by A.userId order by B.ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS first_channel,
FIRST_VALUE(A.channel) OVER (partition by A.userId order by B.ts DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS last_channel
FROM raw_data.user_session_channel A
JOIN raw_data.session_timestamp B ON A.sessionId = B.sessionId
ORDER BY 1
LIMIT 10;
SELECT DISTINCT
A.userId,
LAST_VALUE(A.channel) OVER (partition by A.userId order by B.ts DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS first_channel,
LAST_VALUE(A.channel) OVER (partition by A.userId order by B.ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS last_channel
FROM raw_data.user_session_channel A
JOIN raw_data.session_timestamp B ON A.sessionId = B.sessionId
ORDER BY 1
LIMIT 10;
/*
userid first_channel last_channel
27 Youtube Instagram
29 Naver Naver
33 Google Youtube
34 Youtube Naver
36 Naver Youtube
40 Youtube Google
41 Facebook Youtube
44 Naver Instagram
45 Youtube Instagram
59 Instagram Instagram
*/
모범 답안 (1) - CTE (ROW_NUMBER)
WITH first AS (
SELECT userid, ts, channel, ROW_NUMBER() OVER(PARTITION BY userid ORDER BY ts) seq
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st ON usc.sessionid = st.sessionid
), last AS (
SELECT userid, ts, channel, ROW_NUMBER() OVER(PARTITION BY userid ORDER BY ts DESC) seq
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st ON usc.sessionid = st.sessionid
)
SELECT first.userid AS userid, first.channel AS first_channel, last.channel AS last_channel
FROM first
JOIN last ON first.userid = last.userid and last.seq = 1
WHERE first.seq = 1;
모범 답안 (2) - JOIN (ROW_NUMBER)
SELECT first.userid AS userid, first.channel AS first_channel, last.channel AS last_channel
FROM (
SELECT userid, ts, channel, ROW_NUMBER() OVER(PARTITION BY userid ORDER BY ts) seq
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st ON usc.sessionid = st.sessionid
) first
JOIN (
SELECT userid, ts, channel, ROW_NUMBER() OVER(PARTITION BY userid ORDER BY ts DESC) seq
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st ON usc.sessionid = st.sessionid
) last ON first.userid = last.userid and last.seq = 1
WHERE first.seq = 1;
모범 답안 (3) - GROUP BY (ROW_NUMBER)
SELECT userid,
MAX(CASE WHEN rn1 = 1 THEN channel END) first_touch,
MAX(CASE WHEN rn2 = 1 THEN channel END) last_touch
FROM (
SELECT userid,
channel,
(ROW_NUMBER() OVER (PARTITION BY usc.userid ORDER BY st.ts asc)) AS rn1,
(ROW_NUMBER() OVER (PARTITION BY usc.userid ORDER BY st.ts desc)) AS rn2
FROM raw_data.user_session_channel usc
JOIN raw_data.session_timestamp st ON usc.sessionid = st.sessionid
)
GROUP BY 1;
모범 답안 (4) (FIRST_VALUE / LAST_VALUE)
SELECT DISTINCT
A.userid,
FIRST_VALUE(A.channel) over(partition by A.userid order by B.ts
rows between unbounded preceding and unbounded following) AS First_Channel,
LAST_VALUE(A.channel) over(partition by A.userid order by B.ts
rows between unbounded preceding and unbounded following) AS Last_Channel
FROM raw_data.user_session_channel A
LEFT JOIN raw_data.session_timestamp B ON A.sessionid = B.sessionid;
GrossRevenue가 가장 큰 UserID 10개 찾기
user_session_channel과 session_transaction 테이블을 사용한다.
- GrossRevenue : Refunded를 포함한 매출
문제 해결
SELECT A.userId, SUM(B.amount) grossRevenue
FROM raw_data.user_session_channel A
LEFT JOIN raw_data.session_transaction B ON A.sessionId = B.sessionId
GROUP BY 1
ORDER BY 2 DESC
LIMIT 10;
/*
userid grossrevenue
989 743
772 556
1615 506
654 488
1651 463
973 438
262 422
1099 421
2682 414
891 412
*/
모범 답안 (SUM OVER)
GROUP BY를 사용한 방법은 위의 풀이와 동일하므로 적지 않았다.
SELECT DISTINCT
usc.userid,
SUM(amount) OVER(PARTITION BY usc.userid)
FROM raw_data.user_session_channel AS usc
JOIN raw_data.session_transaction AS revenue ON revenue.sessionid = usc.sessionid
ORDER BY 2 DESC
LIMIT 10;
raw_data.nps 테이블을 바탕으로 월별 NPS 계산
- 고객들이 0 (의향 없음)에서 10 (의향 아주 높음)
- detractor (비추천자) : 0 ~ 6점
- passive (소극자) : 7, 8점
- promoter (홍보자) : 9, 10점
- NPS = promoter 퍼센트 - detractor 퍼센트
문제 해결
SELECT
LEFT(created_at, 7) AS month,
ROUND(COUNT(CASE WHEN score >= 9 THEN id END) * 100.0 / COUNT(1), 2) AS promoter,
ROUND(COUNT(CASE WHEN score <= 6 THEN id END) * 100.0 / COUNT(1), 2) AS detractor,
promoter - detractor AS nps
FROM raw_data.nps
GROUP BY 1
ORDER BY 1;
/*
month promoter detractor nps
2019-01 50.39 48.03 2.36
2019-02 60.28 29.75 30.53
2019-03 72.23 19.33 52.90
2019-04 71.53 18.54 52.99
2019-05 72.03 17.51 54.52
2019-06 78.80 13.78 65.02
2019-07 77.93 13.41 64.52
2019-08 80.14 12.43 67.71
2019-09 65.00 27.05 37.95
2019-10 72.24 18.95 53.29
2019-11 76.83 15.54 61.29
2019-12 79.61 13.61 66.00
*/
모범 답안 (1)
SELECT month,
ROUND((promoters-detractors)::float/total_count*100, 2) AS overall_nps
FROM (
SELECT LEFT(created, 7) AS month,
COUNT(CASE WHEN score >= 9 THEN 1 END) AS promoters,
COUNT(CASE WHEN score <= 6 THEN 1 END) AS detractors,
COUNT(CASE WHEN score > 6 AND score < 9 THEN 1 END) As passives,
COUNT(1) AS total_count
FROM raw_data.nps
GROUP BY 1
ORDER BY 1
);
모범 답안 (2)
SELECT
LEFT(created, 7) AS month,
ROUND(SUM(CASE WHEN score >= 9 THEN 1 WHEN score <= 6 THEN -1 END)::float*100/COUNT(1), 2)
FROM raw_data.nps
GROUP BY 1
ORDER BY 1;
'[프로그래머스] 데이터 엔지니어링 데브코스 3기 > 숙제' 카테고리의 다른 글
[숙제 - 41일 차] 데이터 파이프라인 실습 코드 문제점 해결하기 (0) | 2024.05.20 |
---|---|
[숙제 - 33일 차] S3 -> Redshift, COPY 명령어로 데이터 적재하기 (2) (0) | 2024.05.09 |
[숙제 - 32일 차] S3 -> Redshift, COPY 명령어로 데이터 적재하기 (0) | 2024.05.09 |
[숙제 - 31일 차] 데브코스 ETL/ELT (0) | 2024.05.06 |
[숙제 - 23일 차] 데브코스 SQL 숙제 (0) | 2024.04.25 |