Introduction to R - Part 2

Laboratory of Statistics and Mathematics 2025/2026

Giuseppe Alfonzetti

Rstudio

Recap of R basics

Basic math:

a = 3      # assign a numeric value with `=`
b <- 2     # assign a numeric value with `<-`
a + b      # sum
a - b      # difference
a * b      # product
a / b      # ratio
a^b        # power

R Functions: always called by name_of_the_function()

# c() combines its arguments into a vector
v1 <- c(1, 7, 2)
# seq() generates a regular sequence: here from 0 to 10 in steps of 0.5
v2 <- seq(from = 0, to = 10, by = 0.5)
# matrix() fills the 2 x 3 grid column by column
m <- matrix(c(0, 7, 9, 2, 4, 10), nrow = 2, ncol = 3)

Accessing elements: access elements within vectors and matrices with [ ].

v1[2]   # access the second element of v1
[1] 7
m[2,3]  # access the element in the second row, third column
[1] 10

Additional types of objects (i)

Logical values: represent a TRUE/FALSE statement

# Store in the object `my_logical` the answer to "is 3 less than 4?".
# The comparison evaluates to either TRUE or FALSE.
my_logical <- 3 < 4  # assigned with `<-`, like the other objects in these examples
my_logical
[1] TRUE

Factors: categorical variables with known levels

# start from a raw vector of strings
my_raw_var <- c("low", "low", "medium", "low", "high", "high", "medium")
my_raw_var
[1] "low"    "low"    "medium" "low"    "high"   "high"   "medium"
# convert automatically to a factor via `as.factor()`;
# the levels come out in alphabetical order (high, low, medium)
my_fact <- as.factor(my_raw_var)
my_fact
[1] low    low    medium low    high   high   medium
Levels: high low medium
# manual construction with `factor()`, specifying the order of the levels;
# `ordered=TRUE` makes the result an ordered factor (low < medium < high)
my_fact <- factor(my_raw_var, levels = c("low", "medium", "high"), ordered=TRUE)
my_fact
[1] low    low    medium low    high   high   medium
Levels: low < medium < high

Additional types of objects (ii)

Lists:

List of objects of potentially different nature

# list() collects named elements that may each have a different type
my_list <- list(
  A = c(0, 1, 2),  # numeric vector
  B = "ciao",      # character string
  C = 2 < 5        # logical value
)
my_list
$A
[1] 0 1 2

$B
[1] "ciao"

$C
[1] TRUE

Data frames

  • Matrices in R constrain all columns to have data of the same type (all numeric, all character, etc…).
  • Data frames are more convenient objects in which to store your data: each column can hold a different type.
# One vector per column; each vector has its own type
var1 <- c("pop-up", "banner", "video")  # character
var2 <- c(56, 23, 321)                  # numeric
var3 <- c(7, 2, 25)                     # numeric
var4 <- c(TRUE, FALSE, FALSE)           # logical

# Bind the vectors together as the named columns of a data frame
my_df <- data.frame(
  ad_type = var1,
  n_clicks = var2,
  sales = var3,
  weekend = var4
)
my_df
  ad_type n_clicks sales weekend
1  pop-up       56     7    TRUE
2  banner       23     2   FALSE
3   video      321    25   FALSE

Data frames attributes

colnames(my_df) # get the column names of my_df
[1] "ad_type"  "n_clicks" "sales"    "weekend" 
dim(my_df)       # get the dimensions of my_df (rows, columns)
[1] 3 4
my_df$ad_type    # access a column of my_df by name
[1] "pop-up" "banner" "video" 
my_df[ , 1]      # access a column by index
[1] "pop-up" "banner" "video" 
my_df[2 , ]      # access a row by index
  ad_type n_clicks sales weekend
2  banner       23     2   FALSE

Practice

Workflow basics

  • Create a folder on your laptop in which to store all your future analyses;
  • Open RStudio and create an R Project for today’s practice within that folder.
  • From your file browser, move the downloaded .xlsx file into a subdirectory called data inside your project directory.
  • Within the project, create an R Script file. Here you will write your R commands.
  • Use the here package to manage file paths.

Setup

  • At the top of your script file, list the library() commands needed to perform the analysis. Today we will use:
library(tidyverse)  # data manipulation and plotting
library(here)       # utilities to easily manage directories paths
library(readxl)     # utilities to read Excel files
library(writexl)    # utilities to write Excel files

After loading the R packages, use the i_am() function from the here package to tell R where your script is located. If your script is called my_script.R, run

# tell R where this script is located, so that later paths can be built with here()
i_am("my_script.R")

Now that R knows your position in the directory tree of your laptop, we can read the xlsx file stored in the data folder with

# Build the location of the Excel file stored in the `data` subdirectory
path_to_data <- here("data", "employees.xlsx")
# Read the spreadsheet into `dt`
dt <- read_xlsx(path = path_to_data)
# A tibble: 4 × 6
  last_name first_name department seniority  salary    ID
  <chr>     <chr>      <chr>          <dbl>   <dbl> <dbl>
1 al-Harron Fikra      Marketing          4  72654.  1001
2 Whitaker  Jalen      HR                15 164507.  1002
3 Pillow    Cleevens   Sales              7 102665.  1003
4 Holguin   Austin     I.T.               9 138793.  1004

A first plot

# declare the data and the axis mappings only; no layer is added yet
ggplot(dt, mapping = aes(x = seniority, y = salary))

A first plot

# add a layer of points on top of the mappings to obtain a scatterplot
ggplot(dt, mapping = aes(x = seniority, y = salary)) +
  geom_point()

A first plot

# same scatterplot, with the colour of each point mapped to the department
ggplot(dt, mapping = aes(x = seniority, y = salary)) +
  geom_point(mapping = aes(color = department))

Data manipulation

Visualise all employees in the Marketing department:

# keep only the rows whose department is "Marketing"
filter(dt, department == "Marketing")
# A tibble: 22 × 6
   last_name first_name department seniority  salary    ID
   <chr>     <chr>      <chr>          <dbl>   <dbl> <dbl>
 1 al-Harron Fikra      Marketing          4  72654.  1001
 2 Martinez  Yessica    Marketing          9 108185.  1023
 3 Green     Sarye      Marketing         15 154446.  1029
 4 Wall      Kayla      Marketing          4  79165.  1063
 5 Wynter    Michael    Marketing          2  64957.  1065
 6 Briseno   Dominick   Marketing         15 154728.  1073
 7 Hennefeld Mitchell   Marketing          6  85930.  1074
 8 Mccarthy  Angelica   Marketing         12 137013.  1080
 9 Diltz     Zachary    Marketing          4  75023.  1087
10 Magor     Jacob      Marketing         15 156230.  1093
# ℹ 12 more rows

Sort employees by increasing seniority

# sort the rows by seniority; ascending order is the default
arrange(dt, seniority)
# A tibble: 150 × 6
   last_name     first_name department seniority salary    ID
   <chr>         <chr>      <chr>          <dbl>  <dbl> <dbl>
 1 al-Momin      Mu'mina    Sales              0 47359.  1009
 2 Heng          Marina     Accounting         0 42770.  1026
 3 Huynh         Alicia     Finance            0 61532.  1027
 4 Zavala        Kristina   Finance            0 61431.  1068
 5 el-Ameen      Haneef     Finance            0 58284.  1089
 6 al-Abdallah   Labeeb     Finance            0 59477.  1106
 7 Hiler         Margaret   Finance            0 60065.  1123
 8 Zheng         Brittany   I.T.               0 52634.  1126
 9 al-Jamal      Muna       Marketing          0 42664.  1128
10 Apodaca-Anaya Daniel     Marketing          0 43220.  1148
# ℹ 140 more rows

Pipes

Select employees in the Marketing department AND sort them by increasing seniority:

# The pipe |> passes the result on its left as the first argument of the next call
dt |>
  filter(department == "Marketing") |>  # keep the Marketing employees
  arrange(seniority)                    # then sort them by increasing seniority
# A tibble: 22 × 6
   last_name        first_name department seniority salary    ID
   <chr>            <chr>      <chr>          <dbl>  <dbl> <dbl>
 1 al-Jamal         Muna       Marketing          0 42664.  1128
 2 Apodaca-Anaya    Daniel     Marketing          0 43220.  1148
 3 Wynter           Michael    Marketing          2 64957.  1065
 4 al-Harron        Fikra      Marketing          4 72654.  1001
 5 Wall             Kayla      Marketing          4 79165.  1063
 6 Diltz            Zachary    Marketing          4 75023.  1087
 7 Tuccy            Samantha   Marketing          5 81015.  1110
 8 Hennefeld        Mitchell   Marketing          6 85930.  1074
 9 Gonzalez-Bolivar Luis       Marketing          6 87191.  1108
10 Raibon           Taneja     Marketing          6 87482.  1111
# ℹ 12 more rows

Groupwise manipulations

# total salary within each department
dt |>
  group_by(department) |>
  summarise(total_salary = sum(salary))
# A tibble: 6 × 2
  department total_salary
  <chr>             <dbl>
1 Accounting     1856399.
2 Finance        3004418.
3 HR             2142249.
4 I.T.           3120520.
5 Marketing      2394822.
6 Sales          3049907.
# several summaries can be computed for each department in a single call
dt |>
  group_by(department) |>
  summarise(
    total_salary = sum(salary),     # sum of the salary variable by group
    average_salary = mean(salary),  # average of the salary variable by group
    n_employees = n()               # number of rows by group
  )
# A tibble: 6 × 4
  department total_salary average_salary n_employees
  <chr>             <dbl>          <dbl>       <int>
1 Accounting     1856399.         80713.          23
2 Finance        3004418.         93888.          32
3 HR             2142249.        107112.          20
4 I.T.           3120520.        124821.          25
5 Marketing      2394822.        108856.          22
6 Sales          3049907.        108925.          28

Export results

Let’s say we want to export a summary table of the data and the scatterplot for salary and seniority:

# Summary table: total salary by department
my_summary <- group_by(dt, department) |>
  summarise(total_salary = sum(salary))

# Scatterplot of salary against seniority, with points coloured by department
my_plot <- ggplot(dt, aes(x = seniority, y = salary)) +
  geom_point(aes(color = department))
  • Export them as R objects:
# store both objects together in one .rda file
save(my_summary, my_plot, file = here("my_results.rda"))
  • Export the table as an Excel file:
# write the summary table to an .xlsx file
write_xlsx(my_summary, path = here("my_table.xlsx"))
  • Export the plot as an image:
# The first argument of ggsave() is `filename`; the plot goes in `plot`.
# (`path` is only the directory in which `filename` is created, not the file itself.)
ggsave(filename = here("my_scatterplot.jpg"), plot = my_plot)