[Data Analysis] SQL Recipe for Data Analysis Tutorial #5 - Manipulating multiple tables

[Data Analysis] SQL Recipe for Data Analysis Tutorial #5 - Manipulating multiple tables

Goal

  • Studying SQL for data analysis
  • Practicing chapter 3rd in the book
  • Studying SQL for Manipulating data



Practice

Book for studying

Title 데이터 분석을 위한 SQL 레시피
Author Nagato Kasaki, Naoto Tamiya
Translator 윤인성
Publisher 한빛미디어
Chapter 3
Lesson 8


  • I’m gonna study basic SQL syntax and functions for data analysis
  • I’m gonna study how to manipulate multiple tables
  • I’m gonna study about data-intensive and data-processing
  • I’m gonna try to load test data on PostgreSQL and execute queries in the book


3.8. Manipulating multiple tables


3.8.1. Merge several tables horizontally

Create two user table

  • One of them is the table that stores information about users in application 1, while another table stores users in application 2
 -- DROP TABLE IF EXISTS app1_mst_users;
-- Create user in app1 table
CREATE TABLE app1_mst_users (
    user_id varchar(255)
  , name    varchar(255)
  , email   varchar(255)
);

-- Insert sample data
INSERT INTO app1_mst_users
VALUES
    ('U001', 'Dorbae', 'dorbae@example.com'  )
  , ('U002', 'Chang', 'chang@example.com')
;

-- DROP TABLE IF EXISTS app2_mst_users;
-- Create user in app2 table
CREATE TABLE app2_mst_users (
    user_id varchar(255)
  , name    varchar(255)
  , phone   varchar(255)
);

-- Insert sample data
INSERT INTO app2_mst_users
VALUES
    ('U001', 'Seong', '080-xxxx-xxxx')
  , ('U002', 'Bae', '070-xxxx-xxxx')
;

commit;

-- Select sample data
SELECT *
  FROM app1_mst_users
;

SELECT *
  FROM app2_mst_users
;

screenshot001

screenshot002


Merge tables

  • You can merge several table throug UNION ALL
  • UNION or UNION DISTINCT also merges multiple tables and in this case, it provides the same result. But, it spends more costs than UNION ALL
  • I’m gonna add app_name column for indentifying the app
-- Merge two tables
SELECT 'app1' AS app_name
     , user_id
     , name
     , email
  FROM app1_mst_users
UNION ALL
SELECT 'app1' AS app_name
     , user_id
     , name
     , phone -- or NULL
  FROM app2_mst_users
;

screenshot003


3.8.2. Merge several tables vertically

Create three tables

-- DROP TABLE IF EXISTS mst_categories;
CREATE TABLE mst_categories (
    category_id integer
  , name        varchar(255)
);

INSERT INTO mst_categories
VALUES
    (1, 'dvd' )
  , (2, 'cd'  )
  , (3, 'book')
;

-- DROP TABLE IF EXISTS category_sales;
CREATE TABLE category_sales (
    category_id integer
  , sales       integer
);

INSERT INTO category_sales
VALUES
    (1, 850000)
  , (2, 500000)
;

-- DROP TABLE IF EXISTS product_sale_ranking;
CREATE TABLE product_sale_ranking (
    category_id integer
  , rank        integer
  , product_id  varchar(255)
  , sales       integer
);

INSERT INTO product_sale_ranking
VALUES
    (1, 1, 'D001', 50000)
  , (1, 2, 'D002', 20000)
  , (1, 3, 'D003', 10000)
  , (2, 1, 'C001', 30000)
  , (2, 2, 'C002', 20000)
  , (2, 3, 'C003', 10000)
;

commit;

SELECT * FROM mst_categories;

SELECT * FROM category_sales;

SELECT * FROM product_sale_ranking;

screenshot004

screenshot005

screenshot006


Merge the tables through JOIN

  • You might loose some data because there are not matched data
-- Join tables
SELECT m.category_id
     , m.name
     , s.sales
     , r.product_id AS sale_product
  FROM mst_categories AS m
    JOIN category_sales AS s
      ON m.category_id = s.category_id
    JOIN product_sale_ranking AS r
      ON m.category_id = r.category_id
;

screenshot007


Merge the tables through LEFT JOIN

  • You can maintain the record count of category_sales(basis table)
-- Join tables through LEFT JOIN
SELECT m.category_id
     , m.name
     , s.sales
     , r.product_id AS sale_product
  FROM mst_categories AS m
    LEFT JOIN category_sales AS s
      ON m.category_id = s.category_id
    LEFT JOIN product_sale_ranking AS r
      ON m.category_id = r.category_id
        AND r.rank = 1
;

screenshot008


Merge the tables through Correlated Subquery without JOIN

  • It is possible to use Correlated Subquery when the Middleware supports it
-- Merge tables through Correlated Subquery
SELECT m.category_id
     , m.name
     -- Calculate sales amount byt categories using Correlated Subquery
     , (SELECT s.sales
          FROM category_sales AS s
         WHERE m.category_id = s.category_id
       ) AS sales
     -- Best sales product by categories
     -- Subquery has to return only one row
     , (SELECT r.product_id
          FROM product_sale_ranking AS r
         WHERE m.category_id = r.category_id
         ORDER BY sales DESC
         LIMIT 1
       ) AS top_sale_product
  FROM mst_categories AS m
;

screenshot009


3.8.3. Convert a conditional flag into 0 or 1

Create user with card number table and purchase log table

-- DROP TABLE IF EXISTS mst_users_with_card_number;
-- Create user with card number table
CREATE TABLE mst_users_with_card_number (
    user_id     varchar(255)
  , card_number varchar(255)
);

-- Insert sample data
INSERT INTO mst_users_with_card_number
VALUES
    ('U001', '1234-xxxx-xxxx-xxxx')
  , ('U002', NULL                 )
  , ('U003', '5678-xxxx-xxxx-xxxx')
;

-- DROP TABLE IF EXISTS purchase_log;
-- Create purchase log table
CREATE TABLE purchase_log2 (
    purchase_id integer
  , user_id     varchar(255)
  , amount      integer
  , stamp       varchar(255)
);

INSERT INTO purchase_log2
VALUES
    (10001, 'U001', 200, '2019-01-30 10:00:00')
  , (10002, 'U001', 500, '2019-02-10 10:00:00')
  , (10003, 'U001', 200, '2019-02-12 10:00:00')
  , (10004, 'U002', 800, '2019-03-01 10:00:00')
  , (10005, 'U002', 400, '2019-03-02 10:00:00')
;

commit;

-- Select sample data
SELECT * FROM mst_users_with_card_number;
SELECT * FROM purchase_log2;

screenshot010

screenshot011


Convert boolean flag about purchase or not into 0/1

  • You can maintain the same record count of mst_users_with_card_number by doing LEFT JOIN and GROUP BY user_id
  • And, you can ocnvert a flag into 0/1 using CASE or SIGN function
  • SIGN returns 0 if argument is 0 and returns 1 about a positive number argument and also returns -1 about a negative number argument
-- Convert boolean flag about purchase or not into 0/1
SELECT m.user_id
     , m.card_number
     , COUNT(p.user_id) AS purchase_count
     -- If a card number was registered, return 1 and return 0 about the opposite case
     , CASE WHEN m.card_number IS NOT NULL THEN 1 
            ELSE 0 
            END AS has_case
     -- If user has been ever purchased, return 1 and return 0 about the opposite case
     , SIGN(COUNT(p.user_id)) AS has_purchased
  FROM mst_users_with_card_number AS m
    LEFT JOIN purchase_log2 AS p
      ON m.user_id = p.user_id
  GROUP BY m.user_id, m.card_number
  ORDER BY m.user_id
;

screenshot012


3.8.4. CTE(Common Table Expression)

  • It has been introduced since SQL99
  • It enhances a readability of the code

Create table and insert sample data

--DROP TABLE IF EXISTS product_sales;
-- Create product slaes table
CREATE TABLE product_sales (
    category_name varchar(255)
  , product_id    varchar(255)
  , sales         integer
);

-- Insert sample data
INSERT INTO product_sales
VALUES
    ('dvd' , 'D001', 50000)
  , ('dvd' , 'D002', 20000)
  , ('dvd' , 'D003', 10000)
  , ('cd'  , 'C001', 30000)
  , ('cd'  , 'C002', 20000)
  , ('cd'  , 'C003', 10000)
  , ('book', 'B001', 20000)
  , ('book', 'B002', 15000)
  , ('book', 'B003', 10000)
  , ('book', 'B004',  5000)
;

commit;

-- Select sample data
SELECT * FROM product_sales;

screenshot013


Set name to the table added the ranks by categories

-- Set name to the table added the ranks by categories
WITH
product_sale_ranking AS (
  SELECT category_name
       , product_id
       , sales
       , ROW_NUMBER()
           OVER(PARTITION BY category_name ORDER BY sales DESC)
           AS rank
    FROM product_sales
)
SELECT *
  FROM product_sale_ranking
;

screenshot014


Extract the unique rankings from the ranking list by categories

-- Extract the unique rankings from the ranking list by categories
WITH
product_sale_ranking AS (
  SELECT category_name
       , product_id
       , sales
       , ROW_NUMBER()
           OVER(PARTITION BY category_name ORDER BY sales DESC)
           AS rank
    FROM product_sales
)
, mst_rank AS (
  SELECT DISTINCT rank
    FROM product_sale_ranking 
)
SELECT *
  FROM mst_rank
;

screenshot015


-- Print the ranking by categories
WITH
product_sale_ranking AS (
  SELECT category_name
       , product_id
       , sales
       , ROW_NUMBER()
           OVER(PARTITION BY category_name ORDER BY sales DESC)
           AS rank
    FROM product_sales
)
, mst_rank AS (
  SELECT DISTINCT rank
    FROM product_sale_ranking 
)
SELECT m.rank
     , r1.product_id AS vd
     , r1.sales AS dvd_sales
     , r2.product_id AS cd
     , r3.product_id AS book
     , r3.sales AS book_sales
  FROM mst_rank AS m
    LEFT JOIN product_sale_ranking AS r1
      ON m.rank = r1.rank
      AND r1.category_name = 'dvd'
    LEFT JOIN product_sale_ranking AS r2
      ON m.rank = r2.rank
      AND r2.category_name = 'cd'
    LEFT JOIN product_sale_ranking AS r3
      ON m.rank = r3.rank
      AND r3.category_name = 'book'
  ORDER BY m.rank
;

screenshot016


3.8.5. Make a resemblance table

  • It can enhance the productivity performance for test

Make a resemblance table with specific records

-- Make a resemblance table with specific records
WITH
mst_devices AS (
  SELECT 1 AS device_id, 'PC' AS device_name
  UNION ALL
  SELECT 2 AS device_id, 'SP' AS device_name
  UNION ALL
  SELECT 3 AS device_id, 'Application' AS device_name
)
SELECT *
  FROM mst_devices
;

UNION ALL might drop the performance whne there are a lot of records

screenshot017


Make a resemblance table with VALUE clause

  • In PostgreSQL, it is possible to make a resmeblance table using VALUES
-- Make a resemblance table with VALUE clause
WITH
mst_devices(device_id, device_name) AS (
  VALUES
      (1, 'PC')
    , (2, 'SP')
    , (3, 'Application')
 )
SELECT *
  FROM mst_devices
;

screenshot018


Make a resemblance table through structural table function

  • You can make a dynamic table with a structural function when it is impossible to use VALUE clause
  • For example, you can define a table through array function and explode function in Hive
-- Make a resemblance table through array, explode in Hive
WITH
mst_devces AS (
  -- Convert a array into columns
  SELECT d[0] AS device_id 
       , d[1] AS device_name
    FROM (SELECT explode (
                   array(
                       array('1', 'PC')
                     , array('2', 'SP')
                     , array('3', 'Application')
                   )
                 ) d
         ) AS t
)
SELECT *
  FROM mst_devices
;


Use map data type intead of array data type

-- Use map data type intead of array data type
WITH
mst_devces AS (
  SELECT d['device_id'] AS device_id 
       , d['device_name'] AS device_name
    FROM (SELECT explode (
                   array(
                       map('device_id', '1', 'device_name', 'PC')
                     , map('device_id', '2', 'device_name', 'SP')
                     , map('device_id', '3', 'device_name', 'Application')
                   )
                 ) d
         ) AS t
)
SELECT *
  FROM mst_devices
;


The function making sequence numbers

  • Some middleware support the function whixh makes sequence numbers
  • PostgreSQL supports generate_series function and BigQuery supports generate_array function
-- Make a resemblance table with sequence numbers
WITH 
series AS (
  -- Make from 1 to 5
  SELECT generate_series(1, 5) AS idx
  -- if BigQuery
  -- SELECT idx FROM unnst(generate_array(1, 5)) AS idx
)
SELECT *
  FROM series
;

screenshot019


Make a sequence numbers using repeat funxtion

  • Hive and SparkSQL don’t support the function like generate_series
  • Nevertheless, you cna make sequence numbers using repeat and explode functions
-- Make a resemblance table with sequence numbers through repeat
SELECT ROW_NUMBER()
         OVER(ORDER BY x) AS idx
  FROM 
       -- Make an array through repeat and split
       -- After that, spread it through explode
       (SELECT explode(split(repeat('x', 5 - 1), 'x')) AS x) AS t
;



References

  • 데이터 분석을 위한 SQL 레시피 - 한빛미디어

댓글남기기

-->