Data Generation and Recommendation Engine Using Association Rules (Market Basket) in R

  2018-01-15


library(dplyr, warn.conflicts = F)
library(dbplyr, warn.conflicts = F)
df1 = readr::read_csv("brand.csv")
## Parsed with column specification:
## cols(
##   brand = col_character()
## )
df2 = readr::read_csv("product_category.csv")
## Parsed with column specification:
## cols(
##   product_category = col_character()
## )
df3 = readr::read_csv("version.csv")
## Parsed with column specification:
## cols(
##   version = col_character()
## )
df = merge(df1, df2, all = T) %>%
  merge(df3, all = T) %>%
  mutate( product_title = sprintf("%s %s %s", brand, product_category, version) %>% tolower ) 
writeLines(df$product_title, "product_title.txt")
cat schema.sql
## DROP TABLE IF EXISTS product CASCADE;
## CREATE TABLE product (  
##   product_id SERIAL PRIMARY KEY
##   , title TEXT NOT NULL
##   -- df product_title: word=product_title.txt
##   -- df: text=product_title length=1
## );
## DROP TABLE IF EXISTS purchase_order CASCADE;
## CREATE TABLE purchase_order (  --df: mult=0.1
##   purchase_order_id SERIAL PRIMARY KEY
## );
## DROP TABLE IF EXISTS order_line CASCADE;
## CREATE TABLE order_line (  --df: mult=0.5
##   product_id INTEGER NOT NULL REFERENCES product (product_id)
##   , purchase_order_id INTEGER NOT NULL REFERENCES purchase_order (purchase_order_id)
##   , amount INTEGER NOT NULL --df: offset=1 size=5
##   , PRIMARY KEY (product_id, purchase_order_id)
## );
datafiller --seed 1 --size=300 schema.sql > data.sql
cat data.sql | sed -n "/COPY/,+2 p"
## COPY product (product_id,title) FROM STDIN (ENCODING 'utf-8');
## 1    kia jant fk 4
## 2    dacia kaporta 392
## COPY purchase_order (purchase_order_id) FROM STDIN (ENCODING 'utf-8');
## 1
## 2
## COPY order_line (product_id,purchase_order_id,amount) FROM STDIN (ENCODING 'utf-8');
## 183  6   5
## 186  14  2

Now, we are going to start postgresql database server and import the generated data.

docker start postgreststarterkit_db_1
psql -d app -h localhost -p 5432 -U superuser -f init.sql
## psql:init.sql:1: NOTICE:  drop cascades to 3 other objects
## DETAIL:  drop cascades to table recommendation_engine.product
## drop cascades to table recommendation_engine.purchase_order
## drop cascades to table recommendation_engine.order_line
## DROP SCHEMA
## CREATE SCHEMA
## SET
## psql:schema.sql:1: NOTICE:  table "product" does not exist, skipping
## DROP TABLE
## CREATE TABLE
## psql:schema.sql:8: NOTICE:  table "purchase_order" does not exist, skipping
## DROP TABLE
## CREATE TABLE
## psql:schema.sql:12: NOTICE:  table "order_line" does not exist, skipping
## DROP TABLE
## CREATE TABLE
## # filling table product (300)
## COPY 300
## # filling table purchase_order (30)
## COPY 30
## # filling table order_line (150)
## COPY 150
## ALTER SEQUENCE
## ALTER SEQUENCE
## ANALYZE
## ANALYZE
## ANALYZE

Read data from PostgreSQL into R dataframe:

con2 <- DBI::dbConnect(RPostgreSQL::PostgreSQL()
  , user = Sys.getenv("SUPER_USER")
  , password = Sys.getenv("SUPER_USER_PASSWORD")
  , dbname = "app"
  , host = "localhost"
  , port = "5432"
  , options = "-c search_path=recommendation_engine"
)
prd = dplyr::tbl(con2, "product")
por = dplyr::tbl(con2, "purchase_order")
orl = dplyr::tbl(con2, "order_line")

Join the tables:

orl2 = orl %>%
  dplyr::inner_join(prd, by = "product_id") %>%
  dplyr::select(purchase_order_id, title, amount) %>%
  dplyr::arrange(purchase_order_id, title)
orl2
## # Source:     lazy query [?? x 3]
## # Database:   postgres 10.0.0 [superuser@localhost:5432/app]
## # Ordered by: purchase_order_id, title
##    purchase_order_id                 title amount
##                <int>                 <chr>  <int>
##  1                 1       citroen egsoz a      2
##  2                 1        kia filitre 20      4
##  3                 1     opel aynalar fk 4      5
##  4                 1        opel egsoz 160      1
##  5                 2    bmw silecekler 120      3
##  6                 2     bmw silecekler 39      1
##  7                 2 citroen yağlama 1.9 j      4
##  8                 2    honda şanzıman 120      2
##  9                 2          opel far 160      5
## 10                 2   twingo yakıt 3.2 jk      5
## # ... with more rows

Target shape requires to concatenate all products in one purchase order transaction together:

orl3 = orl2 %>%
  dplyr::collect() %>%
  dplyr::group_by(purchase_order_id) %>%
  dplyr::mutate(key = sprintf("x%s", row_number())) %>%
  dplyr::ungroup() %>%
  dplyr::mutate( row = row_number() ) 
orl3
## # A tibble: 150 x 5
##    purchase_order_id                 title amount   key   row
##                <int>                 <chr>  <int> <chr> <int>
##  1                 1       citroen egsoz a      2    x1     1
##  2                 1        kia filitre 20      4    x2     2
##  3                 1     opel aynalar fk 4      5    x3     3
##  4                 1        opel egsoz 160      1    x4     4
##  5                 2    bmw silecekler 120      3    x1     5
##  6                 2     bmw silecekler 39      1    x2     6
##  7                 2 citroen yağlama 1.9 j      4    x3     7
##  8                 2    honda şanzıman 120      2    x4     8
##  9                 2          opel far 160      5    x5     9
## 10                 2   twingo yakıt 3.2 jk      5    x6    10
## # ... with 140 more rows
orl4 = orl3 %>%
  dplyr::select(purchase_order_id, key, title) %>%
  tidyr::spread(key = key, value = title) 
orl4 
## # A tibble: 30 x 11
##    purchase_order_id                      x1   x10                     x2
##  *             <int>                   <chr> <chr>                  <chr>
##  1                 1         citroen egsoz a  <NA>         kia filitre 20
##  2                 2      bmw silecekler 120  <NA>      bmw silecekler 39
##  3                 3 honda göstergeler 1.5 j  <NA> kia yönlendirme 3.2 jk
##  4                 4 citroen göstergeler 392  <NA>      ford elektrik cax
##  5                 5         honda fren fk 4  <NA>  mercedes balans 1.5 j
##  6                 6         citroen far 392  <NA>       dacia aynalar 12
##  7                 7         dacia rot 1.9 j  <NA>                   <NA>
##  8                 8         citroen diğer a  <NA>       dacia aynalar 12
##  9                 9      citroen mekanik 20  <NA>   ford yönlendirme 160
## 10                10    ford yönlendirme 160  <NA>        opel aynalar 39
## # ... with 20 more rows, and 7 more variables: x3 <chr>, x4 <chr>,
## #   x5 <chr>, x6 <chr>, x7 <chr>, x8 <chr>, x9 <chr>
.libPaths("/Users/mertnuhoglu/.exploratory/R/3.4")
library(exploratory)
## Loading required package: broom
## Loading required package: urltools
## 
## Attaching package: 'exploratory'
## The following object is masked from 'package:urltools':
## 
##     url_parameters
# Data Analysis Steps
res = orl4 %>%
  dplyr::select(-purchase_order_id) %>%
  dplyr::mutate(transaction_id = row_number()) %>%
  tidyr::gather(key, product, -transaction_id, na.rm = TRUE, convert = TRUE) %>%
  dplyr::arrange(transaction_id) %>%
  exploratory::do_apriori(product, transaction_id, min_support = 0.0001) %>%
  dplyr::filter(support > 0.0004) %>%
  dplyr::group_by(rhs) %>%
  dplyr::top_n(3, lift)
res
## # A tibble: 9,966 x 5
## # Groups:   rhs [115]
##                      lhs                   rhs    support confidence  lift
##                    <chr>                 <chr>      <dbl>      <dbl> <dbl>
##  1       honda fren fk 4 mercedes balans 1.5 j 0.03333333        1.0    15
##  2 mercedes balans 1.5 j       honda fren fk 4 0.03333333        0.5    15
##  3     bmw mekanik 1.7 j      citroen fren 105 0.03333333        1.0    30
##  4      citroen fren 105     bmw mekanik 1.7 j 0.03333333        1.0    30
##  5     bmw mekanik 1.7 j       renault rot 1.3 0.03333333        1.0    30
##  6       renault rot 1.3     bmw mekanik 1.7 j 0.03333333        1.0    30
##  7          dacia aks 20        kia balans cax 0.03333333        1.0    30
##  8        kia balans cax          dacia aks 20 0.03333333        1.0    30
##  9          dacia aks 20         kia far 1.5 j 0.03333333        1.0    30
## 10         kia far 1.5 j          dacia aks 20 0.03333333        1.0    30
## # ... with 9,956 more rows
res = orl4 %>%
  dplyr::select(-purchase_order_id) %>%
  dplyr::mutate(transaction_id = row_number()) %>%
  tidyr::gather(key, product, -transaction_id, na.rm = TRUE, convert = TRUE) %>%
  dplyr::arrange(transaction_id) %>%
  exploratory::do_apriori(product, transaction_id, min_support = 0.0001) %>%
  dplyr::filter(support > 0.0004) %>%
  dplyr::group_by(rhs) %>%
  dplyr::top_n(3, lift)
res
## # A tibble: 9,966 x 5
## # Groups:   rhs [115]
##                      lhs                   rhs    support confidence  lift
##                    <chr>                 <chr>      <dbl>      <dbl> <dbl>
##  1       honda fren fk 4 mercedes balans 1.5 j 0.03333333        1.0    15
##  2 mercedes balans 1.5 j       honda fren fk 4 0.03333333        0.5    15
##  3     bmw mekanik 1.7 j      citroen fren 105 0.03333333        1.0    30
##  4      citroen fren 105     bmw mekanik 1.7 j 0.03333333        1.0    30
##  5     bmw mekanik 1.7 j       renault rot 1.3 0.03333333        1.0    30
##  6       renault rot 1.3     bmw mekanik 1.7 j 0.03333333        1.0    30
##  7          dacia aks 20        kia balans cax 0.03333333        1.0    30
##  8        kia balans cax          dacia aks 20 0.03333333        1.0    30
##  9          dacia aks 20         kia far 1.5 j 0.03333333        1.0    30
## 10         kia far 1.5 j          dacia aks 20 0.03333333        1.0    30
## # ... with 9,956 more rows