[Data Analysis] SQL Recipe for Data Analysis Tutorial #6 - Aggregating data based on the time series

[Data Analysis] SQL Recipe for Data Analysis Tutorial #6 - Aggregating data based on the time series

Goal

  • Studying SQL for data analysis
  • Practicing the 4th chapter of the book
  • Studying SQL for Manipulating data



Practice

Book for studying

Title 데이터 분석을 위한 SQL 레시피
Author Nagato Kasaki, Naoto Tamiya
Translator 윤인성
Publisher 한빛미디어
Chapter 4
Lesson 9


  • I’m gonna study basic SQL syntax and functions for data analysis
  • I’m gonna study how to extract data for sales
  • I’m gonna study how to aggregate data based on the time series
  • I’m gonna try to load test data on PostgreSQL and execute queries in the book


4.9. Aggregating data based on the time series


4.9.1. Aggregate sales amount by date

Create purchase log table

-- DROP TABLE IF EXISTS purchase_log3;
-- Purchase log table: one row per order
CREATE TABLE purchase_log3 (
    dt              varchar(255),  -- purchase date kept as 'YYYY-MM-DD' text so the lesson's string functions apply
    order_id        integer,       -- sequential order number
    user_id         varchar(255),  -- pseudonymous user identifier
    purchase_amount integer        -- amount of the purchase
);

-- Insert sample data: 26 orders spread over 2020-01-01 .. 2020-01-10
-- (2-3 orders per day, so the daily aggregates below have small groups)
INSERT INTO purchase_log3
VALUES
    ('2020-01-01',  1, 'rhwpvvitou', 13900)
  , ('2020-01-01',  2, 'hqnwoamzic', 10616)
  , ('2020-01-02',  3, 'tzlmqryunr', 21156)
  , ('2020-01-02',  4, 'wkmqqwbyai', 14893)
  , ('2020-01-03',  5, 'ciecbedwbq', 13054)
  , ('2020-01-03',  6, 'svgnbqsagx', 24384)
  , ('2020-01-03',  7, 'dfgqftdocu', 15591)
  , ('2020-01-04',  8, 'sbgqlzkvyn',  3025)
  , ('2020-01-04',  9, 'lbedmngbol', 24215)
  , ('2020-01-04', 10, 'itlvssbsgx',  2059)
  , ('2020-01-05', 11, 'jqcmmguhik',  4235)
  , ('2020-01-05', 12, 'jgotcrfeyn', 28013)
  , ('2020-01-05', 13, 'pgeojzoshx', 16008)
  , ('2020-01-06', 14, 'msjberhxnx',  1980)
  , ('2020-01-06', 15, 'tlhbolohte', 23494)
  , ('2020-01-06', 16, 'gbchhkcotf',  3966)
  , ('2020-01-07', 17, 'zfmbpvpzvu', 28159)
  , ('2020-01-07', 18, 'yauwzpaxtx',  8715)
  , ('2020-01-07', 19, 'uyqboqfgex', 10805)
  , ('2020-01-08', 20, 'hiqdkrzcpq',  3462)
  , ('2020-01-08', 21, 'zosbvlylpv', 13999)
  , ('2020-01-08', 22, 'bwfbchzgnl',  2299)
  , ('2020-01-09', 23, 'zzgauelgrt', 16475)
  , ('2020-01-09', 24, 'qrzfcwecge',  6469)
  , ('2020-01-10', 25, 'njbpsrvvcq', 16584)
  , ('2020-01-10', 26, 'cyxfgumkst', 11339)
;

-- Persist the sample rows (a no-op when autocommit is enabled)
commit;

-- Select sample data
-- Explicit column list instead of SELECT *, and an ORDER BY so the
-- preview is deterministic (without it, row order is engine-dependent)
SELECT dt
     , order_id
     , user_id
     , purchase_amount
  FROM purchase_log3
 ORDER BY order_id
;

screenshot001


Aggregate sales amount and the average purchase amount per day

  • Imagine that you have to make the below chart

screenshot002


-- Aggregate sales amount and the average purchase amount per day
SELECT
    dt,
    COUNT(*)             AS purchase_count,  -- number of orders that day
    SUM(purchase_amount) AS total_amount,    -- daily sales total
    AVG(purchase_amount) AS avg_amount       -- average spend per order
FROM purchase_log3
GROUP BY dt
ORDER BY dt ASC;

screenshot003


4.9.2. Extract the change followed by date through a moving average

  • You can easily recognize increasing and decreasing changes through the moving average
  • Imagine that you have to make the below chart

screenshot004


-- Extract the day-to-day trend through a 7-day moving average
SELECT
    dt,
    SUM(purchase_amount) AS total_amount,
    -- Moving average over the most recent 7 days;
    -- still produced even when fewer than 7 days are in the frame
    AVG(SUM(purchase_amount)) OVER (
        ORDER BY dt
        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
    ) AS seven_day_avg,
    -- Strict 7-day moving average:
    -- NULL unless the frame contains exactly 7 days
    CASE
        WHEN COUNT(*) OVER (
                 ORDER BY dt
                 ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
             ) = 7
        THEN AVG(SUM(purchase_amount)) OVER (
                 ORDER BY dt
                 ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
             )
    END AS seven_day_avg_strict
FROM purchase_log3
GROUP BY dt
ORDER BY dt ASC;

screenshot005


4.9.3. Calculate the accumulated sales amount of this month

  • Imagine that you have to make the below chart

screenshot006


-- Calculate the accumulated sales amount within each month
SELECT
    dt,
    -- Extract 'YYYY-MM' from the date string
    -- (substring: PostgreSQL, Hive, SparkSQL, Redshift; substr: BigQuery)
    substring(dt, 1, 7) AS year_month,
    SUM(purchase_amount) AS total_amount,
    -- Running total that restarts at the beginning of every month
    SUM(SUM(purchase_amount)) OVER (
        PARTITION BY substring(dt, 1, 7)
        ORDER BY dt
        ROWS UNBOUNDED PRECEDING
    ) AS agg_amount
FROM purchase_log3
GROUP BY dt
ORDER BY dt;

screenshot007


Enhance the readability of query through WITH clause

  • The substring function was used repeatedly, which hurt the code readability
  • You can make it better by WITH

In big-data analysis, writing a clear query is usually more important than optimizing its performance

-- Make a temporary view indicating daily sales amount
WITH
daily_purchase AS (
  SELECT dt
       -- Extract year, month, date respectively
       -- if PostgreSQL, Hive, SparkSQL, Redshift, use substring
       -- if BigQuery, use substr
       , substring(dt, 1, 4) AS year
       , substring(dt, 6, 2) AS month
       , substring(dt, 9, 2) AS date
       , SUM(purchase_amount) AS purchase_amount
       , COUNT(order_id) AS orders
    FROM purchase_log3
   GROUP BY dt
)
-- Explicit column list instead of SELECT * so the output stays stable
-- even if the CTE gains extra columns later
SELECT dt
     , year
     , month
     , date
     , purchase_amount
     , orders
  FROM daily_purchase
 ORDER BY dt
;

screenshot008


-- Calculate the accumulated sales amount of this month through WITH
WITH
daily_purchase AS (
    -- One row per day: date parts, daily sales, daily order count
    -- (substring: PostgreSQL, Hive, SparkSQL, Redshift; substr: BigQuery)
    SELECT
        dt,
        substring(dt, 1, 4) AS year,
        substring(dt, 6, 2) AS month,
        substring(dt, 9, 2) AS date,
        SUM(purchase_amount) AS purchase_amount,
        COUNT(order_id) AS orders
    FROM purchase_log3
    GROUP BY dt
)
SELECT
    dt,
    -- 'YYYY-MM' label (Redshift: concat or the || operator)
    concat(year, '-', month) AS year_month,
    purchase_amount,
    -- Running total that restarts at the start of each month
    SUM(purchase_amount) OVER (
        PARTITION BY year, month
        ORDER BY dt
        ROWS UNBOUNDED PRECEDING
    ) AS agg_amount
FROM daily_purchase
ORDER BY dt;

screenshot009


4.9.4. Calculate the sales per month and compare to the one of last year

Create purchase log table

  • Insert sample data which contains sale amount from 2018 to 2019
-- DROP TABLE IF EXISTS purchase_log4;
-- Purchase log covering 2018-2019, used for year-over-year comparison
CREATE TABLE purchase_log4 (
    dt              varchar(255),  -- purchase date kept as 'YYYY-MM-DD' text
    order_id        integer,       -- sequential order number
    user_id         varchar(255),  -- pseudonymous user identifier
    purchase_amount integer        -- amount of the purchase
);

-- Insert sample data: one order per month from 2018-01 through 2019-12,
-- so every 2019 month has a matching 2018 month for the YoY comparison
INSERT INTO purchase_log4
VALUES
    ('2018-01-01',    1, 'rhwpvvitou', 13900)
  , ('2018-02-08',   95, 'chtanrqtzj', 28469)
  , ('2018-03-09',  168, 'bcqgtwxdgq', 18899)
  , ('2018-04-11',  250, 'kdjyplrxtk', 12394)
  , ('2018-05-11',  325, 'pgnjnnapsc',  2282)
  , ('2018-06-12',  400, 'iztgctnnlh', 10180)
  , ('2018-07-11',  475, 'eucjmxvjkj',  4027)
  , ('2018-08-10',  550, 'fqwvlvndef',  6243)
  , ('2018-09-10',  625, 'mhwhxfxrxq',  3832)
  , ('2018-10-11',  700, 'wyrgiyvaia',  6716)
  , ('2018-11-10',  775, 'cwpdvmhhwh', 16444)
  , ('2018-12-10',  850, 'eqeaqvixkf', 29199)
  , ('2019-01-09',  925, 'efmclayfnr', 22111)
  , ('2019-02-10', 1000, 'qnebafrkco', 11965)
  , ('2019-03-12', 1075, 'gsvqniykgx', 20215)
  , ('2019-04-12', 1150, 'ayzvjvnocm', 11792)
  , ('2019-05-13', 1225, 'knhevkibbp', 18087)
  , ('2019-06-10', 1291, 'wxhxmzqxuw', 18859)
  , ('2019-07-10', 1366, 'krrcpumtzb', 14919)
  , ('2019-08-08', 1441, 'lpglkecvsl', 12906)
  , ('2019-09-07', 1516, 'mgtlsfgfbj',  5696)
  , ('2019-10-07', 1591, 'trgjscaajt', 13398)
  , ('2019-11-06', 1666, 'ccfbjyeqrb',  6213)
  , ('2019-12-05', 1741, 'onooskbtzp', 26024)
;

-- Persist the sample rows (a no-op when autocommit is enabled)
commit;

-- Select sample data
-- Explicit column list instead of SELECT *, and an ORDER BY so the
-- preview is deterministic (without it, row order is engine-dependent)
SELECT dt
     , order_id
     , user_id
     , purchase_amount
  FROM purchase_log4
 ORDER BY order_id
;

screenshot010


Calculate the sales per month and compare to the one of last year

  • Imagine that you have to make the below chart

screenshot011


-- Calculate the sales per month and compare to the one of last year
WITH
daily_purchase AS (
  -- One row per day with date parts and daily totals
  -- (substring: PostgreSQL, Hive, SparkSQL, Redshift; substr: BigQuery)
  SELECT dt
       , substring(dt, 1, 4) AS year
       , substring(dt, 6, 2) AS month
       , substring(dt, 9, 2) AS date
       , SUM(purchase_amount) AS purchase_amount
       , COUNT(order_id) AS orders
    FROM purchase_log4
   GROUP BY dt
)
SELECT month
     , SUM(CASE year WHEN '2018' THEN purchase_amount END) AS amount_2018
     , SUM(CASE year WHEN '2019' THEN purchase_amount END) AS amount_2019
     -- Year-over-year rate (%); NULLIF guards against a division-by-zero
     -- error for a month whose 2018 sales sum to zero
     , 100.0 * SUM(CASE year WHEN '2019' THEN purchase_amount END)
       / NULLIF(SUM(CASE year WHEN '2018' THEN purchase_amount END), 0)
         AS rate
  FROM daily_purchase
 GROUP BY month
 ORDER BY month
;

screenshot012


4.9.5. Utilize Z chart

  • Z chart
    • A diagram charting values over a period (frequently one year) and showing simultaneously monthly figures (or weekly or daily), cumulative totals and the moving averages.
    • It normally takes a Z shape, hence its title and is of use in clarifying the trends present in the data displayed

screenshot013


What factors are needed for Z chart

  • Monthly figure
    • The sum amount of that month
    • It can be weekly or daily

screenshot014


  • Cumulative total
    • Cumulative total from the first month of this year to current month

screenshot015


  • Moving total
    • Total amount from 11 months ago to the current month

screenshot016


How to analyze Z chart

  • Noticeable point on the cumulative total
    • The line for the cumulative total figure will be more straight as the monthly figures are regular

    screenshot017


    • The lean of the line for the cumulative total figure will become steep as the monthly figures are getting bigger

    screenshot018


    • The lean of the line for the cumulative total figure will flatten out as the monthly figures get smaller

    screenshot019


  • Noticeable point on the moving total
    • The line for the moving total will be straight when the sales amount is regular from last year to this year

    screenshot020


    • The graph will rise up when there is an increasing trend for sales amount

    screenshot021


    • The graph will drop when there is a decreasing trend for sales amount

    screenshot022


  • Various shapes of Z chart

screenshot023

screenshot024

screenshot025


  • How to aggregate data for a Z chart through SQL
-- Z chart for sales in 2019
WITH
daily_purchase AS (
  -- One row per day with date parts and daily totals
  -- (substring: PostgreSQL, Hive, SparkSQL, Redshift; substr: BigQuery)
  SELECT dt
       , substring(dt, 1, 4) AS year
       , substring(dt, 6, 2) AS month
       , SUM(purchase_amount) AS purchase_amount
       , COUNT(order_id) AS orders
    FROM purchase_log4
   GROUP BY dt
)
, monthly_purchase AS (
  -- Monthly figure: total sales per (year, month)
  SELECT year
       , month
       , SUM(purchase_amount) AS amount
    FROM daily_purchase
   GROUP BY year, month
)
, calc_index AS (
  -- NOTE: no ORDER BY here — ordering inside a CTE is not guaranteed to
  -- survive into the outer query; only the final ORDER BY matters
  SELECT year
       , month
       , amount
       -- Cumulative total within 2019 (rows before 2019 yield NULL from
       -- the CASE, which SUM ignores)
       , SUM(CASE WHEN year = '2019' THEN amount END)
           OVER(ORDER BY year, month ROWS UNBOUNDED PRECEDING)
           AS agg_amount
       -- Moving total: this month plus the preceding 11 months
       , SUM(amount)
           OVER(ORDER BY year, month ROWS BETWEEN 11 PRECEDING AND CURRENT ROW)
           AS year_avg_amount
    FROM monthly_purchase
)
-- Select data for 2019
SELECT concat(year, '-', month) AS year_month
     , amount
     , agg_amount
     , year_avg_amount
  FROM calc_index
 WHERE year = '2019'
 ORDER BY year_month
;

screenshot026


4.9.6. The significant point for analyzing sales

  • You have to turn to data about surrounding factors like purchase count, a unit cost to understand the reason why there are changes on sales
-- Aggregate data related with sales
WITH
daily_purchase AS (
  -- One row per day with date parts and daily totals
  -- (substring: PostgreSQL, Hive, SparkSQL, Redshift; substr: BigQuery)
  SELECT dt
       , substring(dt, 1, 4) AS year
       , substring(dt, 6, 2) AS month
       , SUM(purchase_amount) AS purchase_amount
       , COUNT(order_id) AS orders
    FROM purchase_log4
   GROUP BY dt
)
, monthly_purchase AS (
  -- Monthly figure: order count, average daily amount, monthly total
  SELECT year
       , month
       , SUM(orders) AS orders
       , AVG(purchase_amount) AS avg_amount
       , SUM(purchase_amount) AS monthly
    FROM daily_purchase
   GROUP BY year, month
)
SELECT concat(year, '-', month) AS year_month
     , orders
     , avg_amount
     , monthly
     -- Cumulative total within each year, accumulated in calendar order
     -- (BUGFIX: was ORDER BY monthly, which accumulated in amount order
     --  and produced a non-monotonic, meaningless running total)
     , SUM(monthly)
         OVER(PARTITION BY year
              ORDER BY month
              ROWS UNBOUNDED PRECEDING)
         AS agg_amount
     -- sales amount 12 months ago
     , LAG(monthly, 12)
         OVER(ORDER BY year, month)
         -- if SparkSQL
         -- OVER(ORDER BY year, month
         --      ROWS BETWEEN 12 PRECEDING AND 12 PRECEDING)
         AS last_year
     -- Rate (%) compared to the same month last year
     , 100.0 * monthly / LAG(monthly, 12)
                           OVER(ORDER BY year, month)
         AS rate
  FROM monthly_purchase
 ORDER BY year_month
;

screenshot027



References

댓글남기기

-->