R programming for beginners (GV900)

Lesson 6: Importing and exporting data

Thursday, January 4, 2024

Video of Lesson 6

1 Setup

Code
library(tidyverse)

2 Using built-in datasets

There are a number of built-in datasets that come with R. You can see a list of them by typing data().

Code
data()

mtcars |> 
  head()
                   mpg cyl disp  hp drat    wt  qsec vs am gear carb
Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
Code
iris |> 
  head()
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa

3 Using dataset packages

Code
# starwars from tidyverse
starwars |> 
  head()
# A tibble: 6 × 14
  name      height  mass hair_color skin_color eye_color birth_year sex   gender
  <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
2 C-3PO        167    75 <NA>       gold       yellow         112   none  mascu…
3 R2-D2         96    32 <NA>       white, bl… red             33   none  mascu…
4 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
5 Leia Org…    150    49 brown      light      brown           19   fema… femin…
6 Owen Lars    178   120 brown, gr… light      blue            52   male  mascu…
# ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#   vehicles <list>, starships <list>
Code
# penguins from palmerpenguins
library(palmerpenguins)
palmerpenguins::penguins_raw |> 
  head()
# A tibble: 6 × 17
  studyName `Sample Number` Species          Region Island Stage `Individual ID`
  <chr>               <dbl> <chr>            <chr>  <chr>  <chr> <chr>          
1 PAL0708                 1 Adelie Penguin … Anvers Torge… Adul… N1A1           
2 PAL0708                 2 Adelie Penguin … Anvers Torge… Adul… N1A2           
3 PAL0708                 3 Adelie Penguin … Anvers Torge… Adul… N2A1           
4 PAL0708                 4 Adelie Penguin … Anvers Torge… Adul… N2A2           
5 PAL0708                 5 Adelie Penguin … Anvers Torge… Adul… N3A1           
6 PAL0708                 6 Adelie Penguin … Anvers Torge… Adul… N3A2           
# ℹ 10 more variables: `Clutch Completion` <chr>, `Date Egg` <date>,
#   `Culmen Length (mm)` <dbl>, `Culmen Depth (mm)` <dbl>,
#   `Flipper Length (mm)` <dbl>, `Body Mass (g)` <dbl>, Sex <chr>,
#   `Delta 15 N (o/oo)` <dbl>, `Delta 13 C (o/oo)` <dbl>, Comments <chr>
Code
library(democracyData)
democracyData::REIGN |> 
  head()
# A tibble: 6 × 14
  reign_cowcode reign_country gwf_casename gwf_startdate gwf_enddate
          <dbl> <chr>         <chr>        <chr>         <chr>      
1             2 USA           USA9/1/1787  3/4/1789      12/31/2020 
2             2 USA           USA9/1/1787  3/4/1789      12/31/2020 
3             2 USA           USA9/1/1787  3/4/1789      12/31/2020 
4             2 USA           USA9/1/1787  3/4/1789      12/31/2020 
5             2 USA           USA9/1/1787  3/4/1789      12/31/2020 
6             2 USA           USA9/1/1787  3/4/1789      12/31/2020 
# ℹ 9 more variables: gwf_regimetype <chr>, Start <date>, End <date>,
#   cow <dbl>, year <dbl>, extended_country_name <chr>, GWn <dbl>, cown <dbl>,
#   in_GW_system <lgl>
Code
library(carData)
carData::BEPS |> 
  head()
              vote age economic.cond.national economic.cond.household Blair
1 Liberal Democrat  43                      3                       3     4
2           Labour  36                      4                       4     4
3           Labour  35                      4                       4     5
4           Labour  24                      4                       2     2
5           Labour  41                      2                       2     1
6           Labour  47                      3                       4     4
  Hague Kennedy Europe political.knowledge gender
1     1       4      2                   2 female
2     4       4      5                   2   male
3     2       3      3                   2   male
4     1       3      4                   0 female
5     1       4      6                   2   male
6     4       2      4                   2   male
Code
library(vdemdata)
view(codebook)
vdem |>
  select(
    ccode = COWcode,
    country_name,
    year,
    coup_attempts = e_pt_coup_attempts,
    coup_succeed = e_pt_coup
  ) |>
  filter(year > 1950,
         coup_attempts > 0) |>
  head()
  ccode country_name year coup_attempts coup_succeed
1   115     Suriname 1980             2            2
2   452        Ghana 1966             1            1
3   452        Ghana 1967             1            0
4   452        Ghana 1972             1            1
5   452        Ghana 1978             1            1
6   452        Ghana 1979             2            1

4 Downloading data from the internet

Code
read.csv("http://ksgleditsch.com/data/1March_Archigos_4.1.txt") |> 
  head()
                                             obsid.leadid.ccode.idacr.leader.startdate.enddate.entry.exit.exitcode.prevtimesinoffice.posttenurefate.gender.yrborn.yrdied.borndate.deathdate.dbpedia.uri.num.entry.num.exit.num.exitcode.num.posttenurefate.fties.ftcur
1                                             USA-1869\t81dcc176-1e42-11e4-b4cd-db5882bf8def\t2\tUSA\tGrant\t1869-03-04\t1877-03-04\tRegular\tRegular\tRegular\t0\tOK\tM\t1822\t1885\tNA\t1885-07-23\thttp://dbpedia.org/resource/Ulysses_S._Grant\t0\t1\t0\t0\tNA\tNA
2                                          USA-1877\t81dcc177-1e42-11e4-b4cd-db5882bf8def\t2\tUSA\tHayes\t1877-03-04\t1881-03-04\tRegular\tRegular\tRegular\t0\tOK\tM\t1822\t1893\tNA\t1893-01-17\thttp://dbpedia.org/resource/Rutherford_B._Hayes\t0\t1\t0\t0\tNA\tNA
3 USA-1881-1\t81dcf24a-1e42-11e4-b4cd-db5882bf8def\t2\tUSA\tGarfield\t1881-03-04\t1881-09-19\tRegular\tIrregular\tAssassination by Unsupported Individual\t0\tDeath\tM\t1831\t1881\tNA\t1881-09-19\thttp://dbpedia.org/resource/James_A._Garfield\t0\t3\t11\t3\tNA\tNA
4                                         USA-1881-2\t81dcf24b-1e42-11e4-b4cd-db5882bf8def\t2\tUSA\tArthur\t1881-09-19\t1885-03-04\tRegular\tRegular\tRegular\t0\tOK\tM\t1829\t1886\tNA\t1886-11-18\thttp://dbpedia.org/resource/Chester_A._Arthur\t0\t1\t0\t0\tNA\tNA
5                                         USA-1885\t34fb1558-3bbd-11e5-afeb-eb6f07f9fec7\t2\tUSA\tCleveland\t1885-03-04\t1889-03-04\tRegular\tRegular\tRegular\t0\tOK\tM\t1837\t1908\tNA\t1908-06-24\thttp://dbpedia.org/resource/Grover_Cleveland\t0\t1\t0\t0\tNA\tNA
6                                         USA-1889\t81dcf24d-1e42-11e4-b4cd-db5882bf8def\t2\tUSA\tHarrison\t1889-03-04\t1893-03-04\tRegular\tRegular\tRegular\t0\tOK\tM\t1833\t1901\tNA\t1901-03-13\thttp://dbpedia.org/resource/Benjamin_Harrison\t0\t1\t0\t0\tNA\tNA
Code
# more efficient tool
library(data.table)
fread("http://ksgleditsch.com/data/1March_Archigos_4.1.txt", encoding = "Latin-1") |> 
  head()
        obsid                               leadid ccode idacr    leader
1:   USA-1869 81dcc176-1e42-11e4-b4cd-db5882bf8def     2   USA     Grant
2:   USA-1877 81dcc177-1e42-11e4-b4cd-db5882bf8def     2   USA     Hayes
3: USA-1881-1 81dcf24a-1e42-11e4-b4cd-db5882bf8def     2   USA  Garfield
4: USA-1881-2 81dcf24b-1e42-11e4-b4cd-db5882bf8def     2   USA    Arthur
5:   USA-1885 34fb1558-3bbd-11e5-afeb-eb6f07f9fec7     2   USA Cleveland
6:   USA-1889 81dcf24d-1e42-11e4-b4cd-db5882bf8def     2   USA  Harrison
    startdate    enddate   entry      exit
1: 1869-03-04 1877-03-04 Regular   Regular
2: 1877-03-04 1881-03-04 Regular   Regular
3: 1881-03-04 1881-09-19 Regular Irregular
4: 1881-09-19 1885-03-04 Regular   Regular
5: 1885-03-04 1889-03-04 Regular   Regular
6: 1889-03-04 1893-03-04 Regular   Regular
                                  exitcode prevtimesinoffice posttenurefate
1:                                 Regular                 0             OK
2:                                 Regular                 0             OK
3: Assassination by Unsupported Individual                 0          Death
4:                                 Regular                 0             OK
5:                                 Regular                 0             OK
6:                                 Regular                 0             OK
   gender yrborn yrdied borndate  deathdate
1:      M   1822   1885     <NA> 1885-07-23
2:      M   1822   1893     <NA> 1893-01-17
3:      M   1831   1881     <NA> 1881-09-19
4:      M   1829   1886     <NA> 1886-11-18
5:      M   1837   1908     <NA> 1908-06-24
6:      M   1833   1901     <NA> 1901-03-13
                                       dbpedia.uri num.entry num.exit
1:    http://dbpedia.org/resource/Ulysses_S._Grant         0        1
2: http://dbpedia.org/resource/Rutherford_B._Hayes         0        1
3:   http://dbpedia.org/resource/James_A._Garfield         0        3
4:   http://dbpedia.org/resource/Chester_A._Arthur         0        1
5:    http://dbpedia.org/resource/Grover_Cleveland         0        1
6:   http://dbpedia.org/resource/Benjamin_Harrison         0        1
   num.exitcode num.posttenurefate fties ftcur
1:            0                  0  <NA>    NA
2:            0                  0  <NA>    NA
3:           11                  3  <NA>    NA
4:            0                  0  <NA>    NA
5:            0                  0  <NA>    NA
6:            0                  0  <NA>    NA

5 Writing data to a file

Code
students <- read_csv("https://pos.it/r4ds-students-csv")
students
# A tibble: 6 × 5
  `Student ID` `Full Name`      favourite.food     mealPlan            AGE  
         <dbl> <chr>            <chr>              <chr>               <chr>
1            1 Sunil Huffmann   Strawberry yoghurt Lunch only          4    
2            2 Barclay Lynn     French fries       Lunch only          5    
3            3 Jayendra Lyne    N/A                Breakfast and lunch 7    
4            4 Leon Rossini     Anchovies          Lunch only          <NA> 
5            5 Chidiegwu Dunkel Pizza              Breakfast and lunch five 
6            6 Güvenç Attila    Ice cream          Lunch only          6    
Code
read.csv("https://pos.it/r4ds-students-csv", na = c("N/A", ""))
  Student.ID        Full.Name     favourite.food            mealPlan  AGE
1          1   Sunil Huffmann Strawberry yoghurt          Lunch only    4
2          2     Barclay Lynn       French fries          Lunch only    5
3          3    Jayendra Lyne               <NA> Breakfast and lunch    7
4          4     Leon Rossini          Anchovies          Lunch only <NA>
5          5 Chidiegwu Dunkel              Pizza Breakfast and lunch five
6          6    Güvenç Attila          Ice cream          Lunch only    6
Code
write.csv(students, "students.csv",row.names = FALSE)

6 Reading data from a file

Code
students2 <- read.csv("students.csv")

students2
  Student.ID        Full.Name     favourite.food            mealPlan  AGE
1          1   Sunil Huffmann Strawberry yoghurt          Lunch only    4
2          2     Barclay Lynn       French fries          Lunch only    5
3          3    Jayendra Lyne                N/A Breakfast and lunch    7
4          4     Leon Rossini          Anchovies          Lunch only <NA>
5          5 Chidiegwu Dunkel              Pizza Breakfast and lunch five
6          6    Güvenç Attila          Ice cream          Lunch only    6

6.1 Practical advice

Code
students |> 
  rename(student_id = `Student ID`,
         full_name = `Full Name`)
# A tibble: 6 × 5
  student_id full_name        favourite.food     mealPlan            AGE  
       <dbl> <chr>            <chr>              <chr>               <chr>
1          1 Sunil Huffmann   Strawberry yoghurt Lunch only          4    
2          2 Barclay Lynn     French fries       Lunch only          5    
3          3 Jayendra Lyne    N/A                Breakfast and lunch 7    
4          4 Leon Rossini     Anchovies          Lunch only          <NA> 
5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch five 
6          6 Güvenç Attila    Ice cream          Lunch only          6    
Code
students |> 
  janitor::clean_names() |> 
  mutate(meal_plan = factor(meal_plan),
         age = parse_number(ifelse(age == "five", 5, age)))
# A tibble: 6 × 5
  student_id full_name        favourite_food     meal_plan             age
       <dbl> <chr>            <chr>              <fct>               <dbl>
1          1 Sunil Huffmann   Strawberry yoghurt Lunch only              4
2          2 Barclay Lynn     French fries       Lunch only              5
3          3 Jayendra Lyne    N/A                Breakfast and lunch     7
4          4 Leon Rossini     Anchovies          Lunch only             NA
5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch     5
6          6 Güvenç Attila    Ice cream          Lunch only              6

6.2 Creating a data frame by yourself

Code
testdata <- data.frame( col1 = c(1, 2, 3),
            col2 = c("a", "b", "c"),
            col3 = c(TRUE, FALSE, TRUE))
testdata
  col1 col2  col3
1    1    a  TRUE
2    2    b FALSE
3    3    c  TRUE

7 Importing data from Excel

Code
readxl::read_excel("ANOVA.xlsx") |> 
  head()
# A tibble: 6 × 7
    Gen Gen2  FirstLang FirstLang2   SES SES2  Grade
  <dbl> <chr>     <dbl> <chr>      <dbl> <chr> <dbl>
1     1 M             1 English        1 Low       2
2     1 M             1 English        1 Low       3
3     1 M             2 NonEnglish     1 Low       6
4     1 M             1 English        2 Mid       8
5     2 F             1 English        1 Low       9
6     1 M             1 English        3 High      8


Thank you!