6.4 Combining data sets

SSCC - Social Science Computing Cooperative

Supporting Statistical Analysis for Research

6.4.1 Joining data frames

The join functions create a new data frame by matching rows from two data frames. The data frames are identified as the left side and the right side, also referred to as x and y respectively. The left side data frame is the data frame that is listed first in the parameter list. The left side may be piped into the join function.

The by parameter controls which columns in the two data frames are used to match the rows of the two data frames.

The left_join() function adds columns from the right side to the left side. The added columns will be filled with NAs for rows on the left side that are not matched to the right side. Rows in the right side that do not match the left side are not included.

6.4.2 Examples - R

These examples use the cps1.csv data set.

We begin by loading the tidyverse and import the csv file.

cps_in <- read_csv(file.path("..", "datasets", "cps1.csv"), col_types = cols())

Warning: Missing column names filled in: 'X1' [1]

cps_in <- 
  cps_in %>%
  rename(
    id = X1,
    no_deg = nodeg,
    real_earn_74 = re74,
    real_earn_75 = re75,
    real_earn_78 = re78
    )
cps <-
  cps_in

head(cps, 3)

# A tibble: 3 x 11
     id   trt   age  educ black  hisp  marr no_deg real_earn_74
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>        <dbl>
1     1     0    45    11     0     0     1      1       21517.
2     2     0    21    14     0     0     0      0        3176.
3     3     0    38    12     0     0     1      0       23039.
# ... with 2 more variables: real_earn_75 <dbl>, real_earn_78 <dbl>

Create two data frames that can be joined.

The real_earn_78 will be removed from one data frame.

cps_part1 <-
  cps %>%
  select(-real_earn_78)

head(cps_part1, 3)

# A tibble: 3 x 10
     id   trt   age  educ black  hisp  marr no_deg real_earn_74
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>        <dbl>
1     1     0    45    11     0     0     1      1       21517.
2     2     0    21    14     0     0     0      0        3176.
3     3     0    38    12     0     0     1      0       23039.
# ... with 1 more variable: real_earn_75 <dbl>

From the other data frame,real_earn_74 real_earn_75 will be removed.

cps_78 <-
  cps %>%
  select(-real_earn_74, -real_earn_75)

head(cps_78, 3)

# A tibble: 3 x 9
     id   trt   age  educ black  hisp  marr no_deg real_earn_78
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>        <dbl>
1     1     0    45    11     0     0     1      1       25565.
2     2     0    21    14     0     0     0      0       13496.
3     3     0    38    12     0     0     1      0       25565.

Using left_join() with all common variables.

In this example the left join is used with no by parameter. This results in a natural join, a join that is done using all columns that have the same name in the two tibbles.

The cps_part1 tibble is the left side and cps_78 is the right side.

cps2 <-
  cps_part1 %>%
  left_join(cps_78)

Joining, by = c("id", "trt", "age", "educ", "black", "hisp", "marr", "no_deg")

glimpse(cps2)

Observations: 15,992
Variables: 11
$ id           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
$ trt          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ age          <dbl> 45, 21, 38, 48, 18, 22, 48, 18, 48, 45, 34, 16, 5...
$ educ         <dbl> 11, 14, 12, 6, 8, 11, 10, 11, 9, 12, 14, 10, 10, ...
$ black        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ hisp         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ marr         <dbl> 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1...
$ no_deg       <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0...
$ real_earn_74 <dbl> 21516.6700, 3175.9710, 23039.0200, 24994.3700, 16...
$ real_earn_75 <dbl> 25243.550, 5852.565, 25130.760, 25243.550, 10727....
$ real_earn_78 <dbl> 25564.670, 13496.080, 25564.670, 25564.670, 9860....

Using left_join() specifying the common variables to use for matching rows.

In this example the by parameter is used to identify the column to joined on.

cps_78 <- select(cps_78, id, real_earn_78)


cps3 <-
  cps_part1 %>%
  left_join(cps_78, by = c("id"))

glimpse(cps3)

Observations: 15,992
Variables: 11
$ id           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
$ trt          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ age          <dbl> 45, 21, 38, 48, 18, 22, 48, 18, 48, 45, 34, 16, 5...
$ educ         <dbl> 11, 14, 12, 6, 8, 11, 10, 11, 9, 12, 14, 10, 10, ...
$ black        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ hisp         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ marr         <dbl> 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1...
$ no_deg       <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0...
$ real_earn_74 <dbl> 21516.6700, 3175.9710, 23039.0200, 24994.3700, 16...
$ real_earn_75 <dbl> 25243.550, 5852.565, 25130.760, 25243.550, 10727....
$ real_earn_78 <dbl> 25564.670, 13496.080, 25564.670, 25564.670, 9860....

Using left_join() specifying the matching variables that have different names.

In this example the by parameter is a name vector to identify differently named columns in the two tibbles.

cps_78 <- rename(cps_78, patient_id = id)
head(cps_78)

# A tibble: 6 x 2
  patient_id real_earn_78
       <dbl>        <dbl>
1          1       25565.
2          2       13496.
3          3       25565.
4          4       25565.
5          5        9861.
6          6       25565.

cps4 <-
  cps_part1 %>%
  left_join(cps_78, by = c("id" = "patient_id"))

glimpse(cps4)

Observations: 15,992
Variables: 11
$ id           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
$ trt          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ age          <dbl> 45, 21, 38, 48, 18, 22, 48, 18, 48, 45, 34, 16, 5...
$ educ         <dbl> 11, 14, 12, 6, 8, 11, 10, 11, 9, 12, 14, 10, 10, ...
$ black        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ hisp         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ marr         <dbl> 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1...
$ no_deg       <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0...
$ real_earn_74 <dbl> 21516.6700, 3175.9710, 23039.0200, 24994.3700, 16...
$ real_earn_75 <dbl> 25243.550, 5852.565, 25130.760, 25243.550, 10727....
$ real_earn_78 <dbl> 25564.670, 13496.080, 25564.670, 25564.670, 9860....

Appending tibbles.

We will append the cps training and testing tibbles that were created in earlier examples.

set.seed(145705)

cps <-
  cps %>%
  mutate(
    split = ifelse(runif(n()) > .75, "test", "train")
  )
cps_train <-
  cps %>%
  filter(split == "train")
cps_test <-
  cps %>%
  filter(split == "test")

dim(cps)

[1] 15992    12

dim(cps_train)

[1] 11902    12

dim(cps_test)

[1] 4090   12

The following appends the test data frame to the train data frame using bind_rows().

cps_all_rows <- 
  cps_train %>%
  bind_rows(cps_test)

dim(cps_all_rows)

[1] 15992    12

Some other joins

right_join() - rows in the left side are matched to the right side.
inner_join() - includes only rows that are in both data frames.
full_join() - includes all row that are in either data frames.
semi_join() - keeps rows in left side that match right side. Does not add columns to the data frame. Duplicate rows are dropped.
anti_join() - keeps rows in left side that are not matched in the right side.
nest_join() - adds a column of tibbles to the left side. Each tibble contains the rows of the right side that match the row on the left side.