Managing Cohort Object

When working with an already defined cohort, you may want to manipulate its configuration (e.g. a filter value) without the need to create the cohort from scratch.

cohortBuilder offers various methods that perform common Cohort management operations.

To present the functionality we’ll be working with the librarian_cohort object below:

librarian_source <- set_source(
  as.tblist(librarian)
)

librarian_cohort <- librarian_source %>% 
  cohort(
    step(
      filter(
        "discrete", id = "author", dataset = "books", 
        variable = "author", value = "Dan Brown"
      ),
      filter(
        "discrete", id = "program", dataset = "borrowers", 
        variable = "program", value = "premium", keep_na = FALSE
      )
    ),
    step(
      filter(
        "range", id = "copies", dataset = "books", 
        variable = "copies", range = c(-Inf, 5)
      )
    ),
    run_flow = TRUE
  )

Managing filters

In order to manage filters configuration you may call the following methods:

Updating filter:

librarian_cohort %>% 
  update_filter(
    step_id = 1, filter_id = "author", value = c("Dan Brown", "Khaled Hosseini")
  )

sum_up(librarian_cohort)
#> >> Step ID: 1
#> -> Filter ID: author
#>    Filter Type: discrete
#>    Filter Parameters:
#>      dataset: books
#>      variable: author
#>      value: Dan Brown, Khaled Hosseini
#>      keep_na: TRUE
#>      description: 
#>      active: TRUE
#> -> Filter ID: program
#>    Filter Type: discrete
#>    Filter Parameters:
#>      dataset: borrowers
#>      variable: program
#>      value: premium
#>      keep_na: FALSE
#>      description: 
#>      active: TRUE
#> >> Step ID: 2
#> -> Filter ID: copies
#>    Filter Type: range
#>    Filter Parameters:
#>      dataset: books
#>      variable: copies
#>      range: -Inf, 5
#>      keep_na: TRUE
#>      description: 
#>      active: TRUE

Adding new filter:

librarian_cohort %>% 
  add_filter(
    filter(
      "date_range", id = "issue_date", dataset = "issues", 
      variable = "date", range = c(as.Date("2010-01-01"), Inf)
    ),
    step_id = 2
  )

sum_up(librarian_cohort)
#> >> Step ID: 1
#> -> Filter ID: author
#>    Filter Type: discrete
#>    Filter Parameters:
#>      dataset: books
#>      variable: author
#>      value: Dan Brown, Khaled Hosseini
#>      keep_na: TRUE
#>      description: 
#>      active: TRUE
#> -> Filter ID: program
#>    Filter Type: discrete
#>    Filter Parameters:
#>      dataset: borrowers
#>      variable: program
#>      value: premium
#>      keep_na: FALSE
#>      description: 
#>      active: TRUE
#> >> Step ID: 2
#> -> Filter ID: copies
#>    Filter Type: range
#>    Filter Parameters:
#>      dataset: books
#>      variable: copies
#>      range: -Inf, 5
#>      keep_na: TRUE
#>      description: 
#>      active: TRUE
#> -> Filter ID: issue_date
#>    Filter Type: date_range
#>    Filter Parameters:
#>      dataset: issues
#>      variable: date
#>      range: 2010-01-01, NA
#>      keep_na: TRUE
#>      description: 
#>      active: TRUE

Removing filter:

librarian_cohort %>% 
  rm_filter(step_id = 2, filter_id = "copies")

sum_up(librarian_cohort)
#> >> Step ID: 1
#> -> Filter ID: author
#>    Filter Type: discrete
#>    Filter Parameters:
#>      dataset: books
#>      variable: author
#>      value: Dan Brown, Khaled Hosseini
#>      keep_na: TRUE
#>      description: 
#>      active: TRUE
#> -> Filter ID: program
#>    Filter Type: discrete
#>    Filter Parameters:
#>      dataset: borrowers
#>      variable: program
#>      value: premium
#>      keep_na: FALSE
#>      description: 
#>      active: TRUE
#> >> Step ID: 2
#> -> Filter ID: issue_date
#>    Filter Type: date_range
#>    Filter Parameters:
#>      dataset: issues
#>      variable: date
#>      range: 2010-01-01, NA
#>      keep_na: TRUE
#>      description: 
#>      active: TRUE

By default the above operations don’t trigger data recalculation, so we need to call the run method.

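Calling run triggers computations for all steps. When only a later step needs recalculating, we can optimize the workflow by skipping the previous steps with the min_step_id parameter. Below we recompute from the second step onwards; note that the author filter in step 1 was also updated above, and because step 1 is skipped here, that change does not appear to be reflected in the returned data (the books table still lists only Dan Brown titles):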

run(librarian_cohort, min_step_id = 2)

get_data(librarian_cohort)
#> $books
#> # A tibble: 2 × 6
#>   isbn          title             genre                publisher  author  copies
#>   <chr>         <chr>             <chr>                <chr>      <chr>    <int>
#> 1 0-385-50420-9 The Da Vinci Code Crime, Thriller & A… Transworld Dan Br…      7
#> 2 0-671-02735-2 Angels and Demons Crime, Thriller & A… Transworld Dan Br…      4
#> 
#> $borrowers
#> # A tibble: 6 × 6
#>   id     registered address                  name           phone_number program
#>   <chr>  <date>     <chr>                    <chr>          <chr>        <chr>  
#> 1 000001 2001-06-09 66 N. Evergreen Ave. No… Mrs. Freddie … 626-594-4729 premium
#> 2 000005 2005-01-15 580 Chapel Rd. Delray B… Ferdinand Ber… 127-363-0738 premium
#> 3 000008 2006-11-15 9533 Delaware Dr. Peabo… Mrs. Ermine H… 460-779-8714 premium
#> 4 000011 2009-03-24 745 E. Sussex Drive Mah… Mr. Sullivan … 378-884-6509 premium
#> 5 000013 2011-09-30 534 Iroquois Ave. Water… Dr. Sharif Ku… 104-832-8013 premium
#> # … with 1 more row
#> 
#> $issues
#> # A tibble: 42 × 4
#>   id     borrower_id isbn          date      
#>   <chr>  <chr>       <chr>         <date>    
#> 1 000001 000019      0-676-97976-9 2015-03-17
#> 2 000003 000016      0-09-177373-3 2014-09-28
#> 3 000006 000018      0-14-303714-5 2016-07-21
#> 4 000008 000016      0-340-89696-5 2016-04-16
#> 5 000009 000017      0-09-177373-3 2016-11-12
#> # … with 37 more rows
#> 
#> $returns
#> # A tibble: 30 × 2
#>   id     date      
#>   <chr>  <date>    
#> 1 000001 2015-04-06
#> 2 000003 2014-10-23
#> 3 000004 2005-12-29
#> 4 000005 2006-03-26
#> 5 000006 2016-08-30
#> # … with 25 more rows
#> 
#> attr(,"class")
#> [1] "tblist"
#> attr(,"call")
#> as.tblist(librarian)

Note. If you want to run data computation directly after calling one of the above methods just set run_flow = TRUE within the method.

Managing steps

Similar to filters, you can operate on the Cohort to manage steps. cohortBuilder offers the add_step and rm_step methods to add a new step or remove an existing one, respectively.

librarian_cohort %>% 
  rm_step(step_id = 1)

sum_up(librarian_cohort)
#> >> Step ID: 1
#> -> Filter ID: issue_date
#>    Filter Type: date_range
#>    Filter Parameters:
#>      dataset: issues
#>      variable: date
#>      range: 2010-01-01, NA
#>      keep_na: TRUE
#>      description: 
#>      active: TRUE

Note. Removing a step other than the last one results in renumbering of the remaining step ids (so that step numbering always starts with 1).

librarian_cohort %>% 
  add_step(
    step(
      filter(
        "discrete", id = "author", dataset = "books", 
        variable = "author", value = "Dan Brown"
      ),
      filter(
        "discrete", id = "program", dataset = "borrowers", 
        variable = "program", value = "premium", keep_na = FALSE
      )
    )
  )

sum_up(librarian_cohort)
#> >> Step ID: 1
#> -> Filter ID: issue_date
#>    Filter Type: date_range
#>    Filter Parameters:
#>      dataset: issues
#>      variable: date
#>      range: 2010-01-01, NA
#>      keep_na: TRUE
#>      description: 
#>      active: TRUE
#> >> Step ID: 2
#> -> Filter ID: author
#>    Filter Type: discrete
#>    Filter Parameters:
#>      dataset: books
#>      variable: author
#>      value: Dan Brown
#>      keep_na: TRUE
#>      description: 
#>      active: TRUE
#> -> Filter ID: program
#>    Filter Type: discrete
#>    Filter Parameters:
#>      dataset: borrowers
#>      variable: program
#>      value: premium
#>      keep_na: FALSE
#>      description: 
#>      active: TRUE

Note. All the methods used for managing steps and filters can also be called on the Source object itself. See vignette("cohort-configuration").

Managing source

The last Cohort configuration component, the source, can also be managed within the Cohort itself. With the update_source method you can change the source defined in an existing Cohort.

Below we update the cohort with a Source that has the source_code parameter defined:

code(librarian_cohort)
#> .pre_filtering <- function(source, data_object, step_id) {
#>     for (dataset in names(data_object)) {
#>         attr(data_object[[dataset]], "filtered") <- FALSE
#>     }
#>     return(data_object)
#> }
#> source <- list(dtconn = as.tblist(librarian))
#> data_object <- source$dtconn
#> # step 1
#> step_id <- "1"
#> data_object <- .pre_filtering(source, data_object, step_id)
#> data_object[["issues"]] <- data_object[["issues"]] %>%
#>     dplyr::filter((date <= Inf & date >= 14610) | is.na(date))
#> attr(data_object[["issues"]], "filtered") <- TRUE
#> # step 2
#> step_id <- "2"
#> data_object <- .pre_filtering(source, data_object, step_id)
#> data_object[["books"]] <- data_object[["books"]] %>%
#>     dplyr::filter(author %in% c("Dan Brown", NA))
#> attr(data_object[["books"]], "filtered") <- TRUE
#> data_object[["borrowers"]] <- data_object[["borrowers"]] %>%
#>     dplyr::filter(program %in% "premium")
#> attr(data_object[["borrowers"]], "filtered") <- TRUE
#> data_object
new_source <- set_source(
  as.tblist(librarian),
  source_code = quote({
    source <- list(attributes = list(datasets = librarian))
  })
)

update_source(librarian_cohort, new_source)
code(librarian_cohort)
#> .pre_filtering <- function(source, data_object, step_id) {
#>     for (dataset in names(data_object)) {
#>         attr(data_object[[dataset]], "filtered") <- FALSE
#>     }
#>     return(data_object)
#> }
#> source <- list(attributes = list(datasets = librarian))
#> data_object <- source$dtconn
#> # step 1
#> step_id <- "1"
#> data_object <- .pre_filtering(source, data_object, step_id)
#> data_object[["issues"]] <- data_object[["issues"]] %>%
#>     dplyr::filter((date <= Inf & date >= 14610) | is.na(date))
#> attr(data_object[["issues"]], "filtered") <- TRUE
#> # step 2
#> step_id <- "2"
#> data_object <- .pre_filtering(source, data_object, step_id)
#> data_object[["books"]] <- data_object[["books"]] %>%
#>     dplyr::filter(author %in% c("Dan Brown", NA))
#> attr(data_object[["books"]], "filtered") <- TRUE
#> data_object[["borrowers"]] <- data_object[["borrowers"]] %>%
#>     dplyr::filter(program %in% "premium")
#> attr(data_object[["borrowers"]], "filtered") <- TRUE
#> data_object
sum_up(librarian_cohort)
#> >> Step ID: 1
#> -> Filter ID: issue_date
#>    Filter Type: date_range
#>    Filter Parameters:
#>      dataset: issues
#>      variable: date
#>      range: 2010-01-01, NA
#>      keep_na: TRUE
#>      description: 
#>      active: TRUE
#> >> Step ID: 2
#> -> Filter ID: author
#>    Filter Type: discrete
#>    Filter Parameters:
#>      dataset: books
#>      variable: author
#>      value: Dan Brown
#>      keep_na: TRUE
#>      description: 
#>      active: TRUE
#> -> Filter ID: program
#>    Filter Type: discrete
#>    Filter Parameters:
#>      dataset: borrowers
#>      variable: program
#>      value: premium
#>      keep_na: FALSE
#>      description: 
#>      active: TRUE
code(librarian_cohort)
#> .pre_filtering <- function(source, data_object, step_id) {
#>     for (dataset in names(data_object)) {
#>         attr(data_object[[dataset]], "filtered") <- FALSE
#>     }
#>     return(data_object)
#> }
#> source <- list(attributes = list(datasets = librarian))
#> data_object <- source$dtconn
#> # step 1
#> step_id <- "1"
#> data_object <- .pre_filtering(source, data_object, step_id)
#> data_object[["issues"]] <- data_object[["issues"]] %>%
#>     dplyr::filter((date <= Inf & date >= 14610) | is.na(date))
#> attr(data_object[["issues"]], "filtered") <- TRUE
#> # step 2
#> step_id <- "2"
#> data_object <- .pre_filtering(source, data_object, step_id)
#> data_object[["books"]] <- data_object[["books"]] %>%
#>     dplyr::filter(author %in% c("Dan Brown", NA))
#> attr(data_object[["books"]], "filtered") <- TRUE
#> data_object[["borrowers"]] <- data_object[["borrowers"]] %>%
#>     dplyr::filter(program %in% "premium")
#> attr(data_object[["borrowers"]], "filtered") <- TRUE
#> data_object

Note that updating source doesn’t remove Cohort configuration (steps and filters). If you want to clear the configuration just set keep_steps = FALSE:

update_source(librarian_cohort, new_source, keep_steps = FALSE)
sum_up(librarian_cohort)
#> No steps configuration found.

You can also use update_source to add Source to an empty Cohort:

empty_cohort <- cohort()
update_source(empty_cohort, new_source)
sum_up(empty_cohort)
#> No steps configuration found.

The update_source method can also be useful if you want to update the source along with the steps and filters configuration.

In this case, a good practice is to keep the configuration directly in the Source:

source_one <- set_source(
  as.tblist(librarian)
) %>% 
  add_step(
    step(
      filter(
        "discrete", id = "author", dataset = "books", 
        variable = "author", value = "Dan Brown"
      ),
      filter(
        "discrete", id = "program", dataset = "borrowers", 
        variable = "program", value = "premium", keep_na = FALSE
      )
    )
  )

source_two <- set_source(
  as.tblist(librarian)
) %>% 
  add_step(
    step(
      filter(
        "range", id = "copies", dataset = "books", 
        variable = "copies", range = c(-Inf, 5)
      )
    )
  )

my_cohort <- cohort(source_one)
sum_up(my_cohort)
#> >> Step ID: 1
#> -> Filter ID: author
#>    Filter Type: discrete
#>    Filter Parameters:
#>      dataset: books
#>      variable: author
#>      value: Dan Brown
#>      keep_na: TRUE
#>      description: 
#>      active: TRUE
#> -> Filter ID: program
#>    Filter Type: discrete
#>    Filter Parameters:
#>      dataset: borrowers
#>      variable: program
#>      value: premium
#>      keep_na: FALSE
#>      description: 
#>      active: TRUE
update_source(my_cohort, source_two)
sum_up(my_cohort)
#> >> Step ID: 1
#> -> Filter ID: copies
#>    Filter Type: range
#>    Filter Parameters:
#>      dataset: books
#>      variable: copies
#>      range: -Inf, 5
#>      keep_na: TRUE
#>      description: 
#>      active: TRUE