Data manipulation

Laboratory of Statistics and Mathematics 2025/2026

Giuseppe Alfonzetti

R basics - filtering

Consider the data.frame

my_df
  id ad_type n_clicks sales weekend
1  1  pop-up       56     7    TRUE
2  2  banner       23     2   FALSE
3  3   video      321    25   FALSE
4  4  banner       89    10    TRUE
5  5  pop-up       10    23    TRUE

Create a filter:

my_df$ad_type == "banner"
[1] FALSE  TRUE FALSE  TRUE FALSE
my_filter = my_df$sales > 20
my_filter
[1] FALSE FALSE  TRUE FALSE  TRUE

View:

my_df[my_filter, ]
  id ad_type n_clicks sales weekend
3  3   video      321    25   FALSE
5  5  pop-up       10    23    TRUE

Overwrite:

my_df[my_filter, ]$ad_type = c("banner", "banner")
my_df
  id ad_type n_clicks sales weekend
1  1  pop-up       56     7    TRUE
2  2  banner       23     2   FALSE
3  3  banner      321    25   FALSE
4  4  banner       89    10    TRUE
5  5  banner       10    23    TRUE

R basics - Missing values

Consider the data.frame

my_df
  id ad_type n_clicks sales weekend
1  1  pop-up       56     7    TRUE
2  2  banner       23    NA   FALSE
3  3   video      321    25   FALSE
4  4  banner       NA    10    TRUE
5  5  pop-up       10    23    TRUE
summary(my_df)
       id      ad_type             n_clicks          sales      
 Min.   :1   Length:5           Min.   : 10.00   Min.   : 7.00  
 1st Qu.:2   Class :character   1st Qu.: 19.75   1st Qu.: 9.25  
 Median :3   Mode  :character   Median : 39.50   Median :16.50  
 Mean   :3                      Mean   :102.50   Mean   :16.25  
 3rd Qu.:4                      3rd Qu.:122.25   3rd Qu.:23.50  
 Max.   :5                      Max.   :321.00   Max.   :25.00  
                                NA's   :1        NA's   :1      
  weekend       
 Mode :logical  
 FALSE:2        
 TRUE :3        
                
                
                
                

R basics - Missing values

Consider the data.frame

my_df
  id ad_type n_clicks sales weekend
1  1  pop-up       56     7    TRUE
2  2  banner       23    NA   FALSE
3  3   video      321    25   FALSE
4  4  banner       NA    10    TRUE
5  5  pop-up       10    23    TRUE

Locate:

clicks_missing <- is.na(my_df$n_clicks)
clicks_missing
[1] FALSE FALSE FALSE  TRUE FALSE

Replace:

my_df$n_clicks[clicks_missing] <- median(my_df$n_clicks, na.rm=TRUE)
my_df
  id ad_type n_clicks sales weekend
1  1  pop-up     56.0     7    TRUE
2  2  banner     23.0    NA   FALSE
3  3   video    321.0    25   FALSE
4  4  banner     39.5    10    TRUE
5  5  pop-up     10.0    23    TRUE

Data manipulation with tidyverse

Consider this dataset:

library(tidyverse)
penguins
# A tibble: 344 × 8
   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
 1 Adelie  Torgersen           39.1          18.7               181        3750
 2 Adelie  Torgersen           39.5          17.4               186        3800
 3 Adelie  Torgersen           40.3          18                 195        3250
 4 Adelie  Torgersen           NA            NA                  NA          NA
 5 Adelie  Torgersen           36.7          19.3               193        3450
 6 Adelie  Torgersen           39.3          20.6               190        3650
 7 Adelie  Torgersen           38.9          17.8               181        3625
 8 Adelie  Torgersen           39.2          19.6               195        4675
 9 Adelie  Torgersen           34.1          18.1               193        3475
10 Adelie  Torgersen           42            20.2               190        4250
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>

Select:

my_penguins <- select(penguins, species, island, sex, bill_length_mm, body_mass_g, year)
my_penguins
# A tibble: 344 × 6
   species island    sex    bill_length_mm body_mass_g  year
   <fct>   <fct>     <fct>           <dbl>       <int> <int>
 1 Adelie  Torgersen male             39.1        3750  2007
 2 Adelie  Torgersen female           39.5        3800  2007
 3 Adelie  Torgersen female           40.3        3250  2007
 4 Adelie  Torgersen <NA>             NA            NA  2007
 5 Adelie  Torgersen female           36.7        3450  2007
 6 Adelie  Torgersen male             39.3        3650  2007
 7 Adelie  Torgersen female           38.9        3625  2007
 8 Adelie  Torgersen male             39.2        4675  2007
 9 Adelie  Torgersen <NA>             34.1        3475  2007
10 Adelie  Torgersen <NA>             42          4250  2007
# ℹ 334 more rows

Data manipulation with tidyverse

Filter:

filter(my_penguins, sex=="female")
# A tibble: 165 × 6
   species island    sex    bill_length_mm body_mass_g  year
   <fct>   <fct>     <fct>           <dbl>       <int> <int>
 1 Adelie  Torgersen female           39.5        3800  2007
 2 Adelie  Torgersen female           40.3        3250  2007
 3 Adelie  Torgersen female           36.7        3450  2007
 4 Adelie  Torgersen female           38.9        3625  2007
 5 Adelie  Torgersen female           41.1        3200  2007
 6 Adelie  Torgersen female           36.6        3700  2007
 7 Adelie  Torgersen female           38.7        3450  2007
 8 Adelie  Torgersen female           34.4        3325  2007
 9 Adelie  Biscoe    female           37.8        3400  2007
10 Adelie  Biscoe    female           35.9        3800  2007
# ℹ 155 more rows
filter(my_penguins, sex=="female", island=="Biscoe")
# A tibble: 80 × 6
   species island sex    bill_length_mm body_mass_g  year
   <fct>   <fct>  <fct>           <dbl>       <int> <int>
 1 Adelie  Biscoe female           37.8        3400  2007
 2 Adelie  Biscoe female           35.9        3800  2007
 3 Adelie  Biscoe female           35.3        3800  2007
 4 Adelie  Biscoe female           40.5        3200  2007
 5 Adelie  Biscoe female           37.9        3150  2007
 6 Adelie  Biscoe female           39.6        3500  2008
 7 Adelie  Biscoe female           35          3450  2008
 8 Adelie  Biscoe female           34.5        2900  2008
 9 Adelie  Biscoe female           39          3550  2008
10 Adelie  Biscoe female           36.5        2850  2008
# ℹ 70 more rows

Data manipulation with tidyverse

Mutate:

mutate(my_penguins, body_mass_kg=body_mass_g/1000)
# A tibble: 344 × 7
   species island    sex    bill_length_mm body_mass_g  year body_mass_kg
   <fct>   <fct>     <fct>           <dbl>       <int> <int>        <dbl>
 1 Adelie  Torgersen male             39.1        3750  2007         3.75
 2 Adelie  Torgersen female           39.5        3800  2007         3.8 
 3 Adelie  Torgersen female           40.3        3250  2007         3.25
 4 Adelie  Torgersen <NA>             NA            NA  2007        NA   
 5 Adelie  Torgersen female           36.7        3450  2007         3.45
 6 Adelie  Torgersen male             39.3        3650  2007         3.65
 7 Adelie  Torgersen female           38.9        3625  2007         3.62
 8 Adelie  Torgersen male             39.2        4675  2007         4.68
 9 Adelie  Torgersen <NA>             34.1        3475  2007         3.48
10 Adelie  Torgersen <NA>             42          4250  2007         4.25
# ℹ 334 more rows
mutate(my_penguins, body_mass_g=body_mass_g+1)
# A tibble: 344 × 6
   species island    sex    bill_length_mm body_mass_g  year
   <fct>   <fct>     <fct>           <dbl>       <dbl> <int>
 1 Adelie  Torgersen male             39.1        3751  2007
 2 Adelie  Torgersen female           39.5        3801  2007
 3 Adelie  Torgersen female           40.3        3251  2007
 4 Adelie  Torgersen <NA>             NA            NA  2007
 5 Adelie  Torgersen female           36.7        3451  2007
 6 Adelie  Torgersen male             39.3        3651  2007
 7 Adelie  Torgersen female           38.9        3626  2007
 8 Adelie  Torgersen male             39.2        4676  2007
 9 Adelie  Torgersen <NA>             34.1        3476  2007
10 Adelie  Torgersen <NA>             42          4251  2007
# ℹ 334 more rows

Data manipulation with tidyverse

Mutate according to logical conditions:

my_penguins
# A tibble: 344 × 6
   species island    sex    bill_length_mm body_mass_g  year
   <fct>   <fct>     <fct>           <dbl>       <int> <int>
 1 Adelie  Torgersen male             39.1        3750  2007
 2 Adelie  Torgersen female           39.5        3800  2007
 3 Adelie  Torgersen female           40.3        3250  2007
 4 Adelie  Torgersen <NA>             NA            NA  2007
 5 Adelie  Torgersen female           36.7        3450  2007
 6 Adelie  Torgersen male             39.3        3650  2007
 7 Adelie  Torgersen female           38.9        3625  2007
 8 Adelie  Torgersen male             39.2        4675  2007
 9 Adelie  Torgersen <NA>             34.1        3475  2007
10 Adelie  Torgersen <NA>             42          4250  2007
# ℹ 334 more rows
mutate(
    my_penguins, 
    bill_length_mm = replace(bill_length_mm, bill_length_mm>39, 100)
)
# A tibble: 344 × 6
   species island    sex    bill_length_mm body_mass_g  year
   <fct>   <fct>     <fct>           <dbl>       <int> <int>
 1 Adelie  Torgersen male            100          3750  2007
 2 Adelie  Torgersen female          100          3800  2007
 3 Adelie  Torgersen female          100          3250  2007
 4 Adelie  Torgersen <NA>             NA            NA  2007
 5 Adelie  Torgersen female           36.7        3450  2007
 6 Adelie  Torgersen male            100          3650  2007
 7 Adelie  Torgersen female           38.9        3625  2007
 8 Adelie  Torgersen male            100          4675  2007
 9 Adelie  Torgersen <NA>             34.1        3475  2007
10 Adelie  Torgersen <NA>            100          4250  2007
# ℹ 334 more rows

Data manipulation with tidyverse

Replace Missing values

my_penguins
# A tibble: 344 × 6
   species island    sex    bill_length_mm body_mass_g  year
   <fct>   <fct>     <fct>           <dbl>       <int> <int>
 1 Adelie  Torgersen male             39.1        3750  2007
 2 Adelie  Torgersen female           39.5        3800  2007
 3 Adelie  Torgersen female           40.3        3250  2007
 4 Adelie  Torgersen <NA>             NA            NA  2007
 5 Adelie  Torgersen female           36.7        3450  2007
 6 Adelie  Torgersen male             39.3        3650  2007
 7 Adelie  Torgersen female           38.9        3625  2007
 8 Adelie  Torgersen male             39.2        4675  2007
 9 Adelie  Torgersen <NA>             34.1        3475  2007
10 Adelie  Torgersen <NA>             42          4250  2007
# ℹ 334 more rows
mutate(my_penguins, body_mass_g_filled = replace(body_mass_g, is.na(body_mass_g), median(body_mass_g, na.rm=TRUE)))
# A tibble: 344 × 7
   species island    sex    bill_length_mm body_mass_g  year body_mass_g_filled
   <fct>   <fct>     <fct>           <dbl>       <int> <int>              <dbl>
 1 Adelie  Torgersen male             39.1        3750  2007               3750
 2 Adelie  Torgersen female           39.5        3800  2007               3800
 3 Adelie  Torgersen female           40.3        3250  2007               3250
 4 Adelie  Torgersen <NA>             NA            NA  2007               4050
 5 Adelie  Torgersen female           36.7        3450  2007               3450
 6 Adelie  Torgersen male             39.3        3650  2007               3650
 7 Adelie  Torgersen female           38.9        3625  2007               3625
 8 Adelie  Torgersen male             39.2        4675  2007               4675
 9 Adelie  Torgersen <NA>             34.1        3475  2007               3475
10 Adelie  Torgersen <NA>             42          4250  2007               4250
# ℹ 334 more rows

Data manipulation with tidyverse

Replace Missing values

mutate(
    my_penguins, 
    body_mass_g = replace(body_mass_g, is.na(body_mass_g), median(body_mass_g, na.rm=TRUE)),
    bill_length_mm = replace(bill_length_mm, is.na(bill_length_mm), median(bill_length_mm, na.rm=TRUE))
)
# A tibble: 344 × 6
   species island    sex    bill_length_mm body_mass_g  year
   <fct>   <fct>     <fct>           <dbl>       <dbl> <int>
 1 Adelie  Torgersen male             39.1        3750  2007
 2 Adelie  Torgersen female           39.5        3800  2007
 3 Adelie  Torgersen female           40.3        3250  2007
 4 Adelie  Torgersen <NA>             44.4        4050  2007
 5 Adelie  Torgersen female           36.7        3450  2007
 6 Adelie  Torgersen male             39.3        3650  2007
 7 Adelie  Torgersen female           38.9        3625  2007
 8 Adelie  Torgersen male             39.2        4675  2007
 9 Adelie  Torgersen <NA>             34.1        3475  2007
10 Adelie  Torgersen <NA>             42          4250  2007
# ℹ 334 more rows

Data manipulation with tidyverse

Pipes:

penguins
# A tibble: 344 × 8
   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
 1 Adelie  Torgersen           39.1          18.7               181        3750
 2 Adelie  Torgersen           39.5          17.4               186        3800
 3 Adelie  Torgersen           40.3          18                 195        3250
 4 Adelie  Torgersen           NA            NA                  NA          NA
 5 Adelie  Torgersen           36.7          19.3               193        3450
 6 Adelie  Torgersen           39.3          20.6               190        3650
 7 Adelie  Torgersen           38.9          17.8               181        3625
 8 Adelie  Torgersen           39.2          19.6               195        4675
 9 Adelie  Torgersen           34.1          18.1               193        3475
10 Adelie  Torgersen           42            20.2               190        4250
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>
penguins |> 
    select(species, island, sex, body_mass_g, bill_length_mm) |> 
    mutate(body_mass_kg=body_mass_g/1000) |> 
    filter(sex=="female") |> 
    arrange(bill_length_mm) |> 
    select(-body_mass_g)
# A tibble: 165 × 5
   species island    sex    bill_length_mm body_mass_kg
   <fct>   <fct>     <fct>           <dbl>        <dbl>
 1 Adelie  Dream     female           32.1         3.05
 2 Adelie  Dream     female           33.1         2.9 
 3 Adelie  Torgersen female           33.5         3.6 
 4 Adelie  Dream     female           34           3.4 
 5 Adelie  Torgersen female           34.4         3.32
 6 Adelie  Biscoe    female           34.5         2.9 
 7 Adelie  Torgersen female           34.6         3.2 
 8 Adelie  Biscoe    female           35           3.45
 9 Adelie  Biscoe    female           35           3.72
10 Adelie  Torgersen female           35.2         3.05
# ℹ 155 more rows

Groupwise manipulation

Summarise by group:

penguins |> 
    group_by(species) |> 
    summarise(average_bill_length = mean(bill_length_mm))
# A tibble: 3 × 2
  species   average_bill_length
  <fct>                   <dbl>
1 Adelie                   NA  
2 Chinstrap                48.8
3 Gentoo                   NA  
penguins |> 
    group_by(species) |> 
    summarise(average_bill_length = mean(bill_length_mm, na.rm=TRUE))
# A tibble: 3 × 2
  species   average_bill_length
  <fct>                   <dbl>
1 Adelie                   38.8
2 Chinstrap                48.8
3 Gentoo                   47.5

Groupwise manipulation

Mutate by group:

penguins |> 
    select(species, island, bill_length_mm) |> 
    group_by(species) |> 
    mutate(species_average_bill_length = mean(bill_length_mm, na.rm=TRUE))
# A tibble: 344 × 4
# Groups:   species [3]
   species island    bill_length_mm species_average_bill_length
   <fct>   <fct>              <dbl>                       <dbl>
 1 Adelie  Torgersen           39.1                        38.8
 2 Adelie  Torgersen           39.5                        38.8
 3 Adelie  Torgersen           40.3                        38.8
 4 Adelie  Torgersen           NA                          38.8
 5 Adelie  Torgersen           36.7                        38.8
 6 Adelie  Torgersen           39.3                        38.8
 7 Adelie  Torgersen           38.9                        38.8
 8 Adelie  Torgersen           39.2                        38.8
 9 Adelie  Torgersen           34.1                        38.8
10 Adelie  Torgersen           42                          38.8
# ℹ 334 more rows
penguins |> 
    select(species, island, bill_length_mm) |> 
    group_by(species) |> 
    mutate(
        species_average_bill_length = mean(bill_length_mm, na.rm=TRUE),
        dist_from_species_average = bill_length_mm - species_average_bill_length
        )
# A tibble: 344 × 5
# Groups:   species [3]
   species island   bill_length_mm species_average_bill…¹ dist_from_species_av…²
   <fct>   <fct>             <dbl>                  <dbl>                  <dbl>
 1 Adelie  Torgers…           39.1                   38.8                  0.309
 2 Adelie  Torgers…           39.5                   38.8                  0.709
 3 Adelie  Torgers…           40.3                   38.8                  1.51 
 4 Adelie  Torgers…           NA                     38.8                 NA    
 5 Adelie  Torgers…           36.7                   38.8                 -2.09 
 6 Adelie  Torgers…           39.3                   38.8                  0.509
 7 Adelie  Torgers…           38.9                   38.8                  0.109
 8 Adelie  Torgers…           39.2                   38.8                  0.409
 9 Adelie  Torgers…           34.1                   38.8                 -4.69 
10 Adelie  Torgers…           42                     38.8                  3.21 
# ℹ 334 more rows
# ℹ abbreviated names: ¹​species_average_bill_length, ²​dist_from_species_average

Save objects

my_res <- penguins |> 
    group_by(species) |> 
    summarise(average_bill_length = mean(bill_length_mm, na.rm=TRUE))
my_res
# A tibble: 3 × 2
  species   average_bill_length
  <fct>                   <dbl>
1 Adelie                   38.8
2 Chinstrap                48.8
3 Gentoo                   47.5

Store as an R data object:

# To save the object in your current project folder under the name "my_results.rda"
save(my_res, here("my_results.rda"))

# To load the object when you need it next time
load(here("my_results.rda"))

Store as an Excel file:

# To save the object in a spreadsheet in your current project folder under the name "my_results.rda"
write_xlsx(my_res, path = here("my_results.xlsx"))