Pen & Paper


Getting Data with R

08 Jan 2015

Reading url

if (!file.exists("testdata")) {
  dir.create("testdata")    
}
fileUrl <- "http://data.bltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl, destfile="./cam.csv", method="curl")
list.files("./")

dateDownload.cam <- date()

Reading Flat Files

read.table()
    file, header, sep, row.names, nrows
read.csv()
read.csv2()
    quote="", na.strings, nrows, skip

Reading Excel Files

library("xlsx")
read.xlsx("./cam.xlsx", sheetIndex=1, header=TRUE)
    colIndex <- 2:3
    rowIndex <- 1:5
    
also,
write.xlsx()
read.xlsx2()
XLConnect()
Notes for installing package “xlsx”
  1. may need to reconfigure “java” if installation fails
  2. if more than one version of java runtime, use sudo update-alternatives --config java to choose the default version
  3. reconfigure R to use the default version sudo R CMD javareconf

Reading XML

library("XML")
doc <- xmlTreeParse(fileUrl, useInternal=TRUE)
rootNode <-xmlRoot(doc)
xmlName(rootNode)

rootNode[[1]]          #Double [] to retrieve item of a list
rootNode[[1]][[1]]

Using xmlSApply to extract

xmlSApply(rootNode, xmlValue)

XPath

/node   # Top level node
//node  # at any level
node[@attr-name="bob"] # node with attrible name='bob'

xpathSApply(rootNode, "//name", xmlValue)
xpathSApply(rootNode, "//price", xmlValue)

Ex:
fileUrl <- "http://espn.go.com/nfl/team/_/name/bal/baltimore-ravens"
doc <- htmlTreeParse(fileUrl, useInternal=TRUE)   # html instead of xml
scores <- xpathSApply(doc, "//li[@class='score']", xmlValue)
teams <- xpathSApply(doc, "//li[@class='team-name']", xmlValue)
scores
teams
refs:
  1. Extracting data from XML
  2. Short Into to XML Pkg

Reading JSON Files

library("jsonlite")
jsonData <- fromJSON("https://api.github.com/users/jtleek/repos")
names(jsonData)

names(jsonData$owner)
names(jsonData$owner$login)
myjson <- toJSON(iris, pretty=TRUE)

iris2 <- fromJSON(myjson)
head(iris2)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
ref:
  1. R-blogger jsonlite

Reading mySQL

library("RMySQL")

dbfile <-dbConnect(MySQL(), user="username", host="localhost")
dbData <-dbGetQuery(dbfile, "show databases;")
dbDisconnect(dbfile)

also,
    dbListTables
    dbListFields
    dbReadTable
    dbSendQuery
    fetch

Reference
Leek, J; Peng, R & Caffo, B (2015). “Getting and Cleaning Data” [Lecture Slides]. Retrieved from https://d396qusza40orc.cloudfront.net/getdata/lecture_slides/