Read and Write Files From HDFS With R
-
Install the following packages, then load them:
# Load the packages used in this guide.
# httr sends the HTTP requests; getPass prompts for a password without
# displaying it.
library(httr)
library(getPass)
-
The
httr
package is used to execute curl
requests in the write
function. -
The
getPass
package is used to hide passwords typed in RStudio apps.
-
Read Files From HDFS With R
-
Define the access parameters to your files as follows:
# WebHDFS URL
hdfsUri <- "https://nn1.pX.company.prod.saagie.io:50470/webhdfs/v1"
# Path to the file to read
fileUri <- "/path/to/myfile.csv"
# OPEN => Read a file
readParameter <- "?op=OPEN"
# Optional parameters in "&name1=value1&name2=value2" format
optionnalParameters <- ""
# Concatenate the parameters into the full request URL
uri <- paste0(hdfsUri, fileUri, readParameter, optionnalParameters)
-
Download your file. The first snippet below works without Kerberos; the second authenticates with Kerberos.
# Without Kerberos: read the CSV directly from the WebHDFS URL.
data <- read.csv(uri)
print(data)
library(getPass)

# Method 1 (interactive): Use in RStudio.
# An interactive pop-up is used to enter the password, which is passed to
# kinit on its standard input.
system("kinit user", input = getPass("Enter your password: "))

# Method 2 (scripts): Use outside RStudio.
# The password is written in the command line or stored in an environment
# variable. Uncomment the following line to use it.
# system('echo password | kinit user')

library(httr)

# Turn off SSL peer verification for the requests that follow.
set_config(config(ssl_verifypeer = 0L))

# Authenticate with Kerberos.
auth <- authenticate(":", "", "gssnegotiate")

# Fetch the file from the WebHDFS URL (`uri`, built from the access
# parameters).
response <- GET(uri, auth)

# Fail loudly on an HTTP error instead of parsing an error page as CSV.
stop_for_status(response)

# The data is in the content of the response, as text. It must be passed
# through `text =`; a bare string would be treated as a file path.
data <- read.csv(text = content(response, "text"))
Write Files to HDFS With R
-
Define the access parameters to your files as follows:
library(httr)

# WebHDFS URL
hdfsUri <- "https://nn1.pX.company.prod.saagie.io:50470/webhdfs/v1"
# Path to the file to write
fileUri <- "/path/to/myfile.csv"
# CREATE => Write a file
writeParameter <- "?op=CREATE"
# Optional parameters in "&name1=value1&name2=value2" format
optionnalParameters <- "&overwrite=true"
# Concatenate the parameters into the full request URL
uri <- paste0(hdfsUri, fileUri, writeParameter, optionnalParameters)
-
Write the temporary file locally.
# Write the data to a temporary local CSV file, without the row names.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
write.csv(data, row.names = FALSE, file = "my_local_file.csv")
-
Upload your file with or without Kerberos.
-
Without Kerberos, ask the NameNode on which DataNode to write the file:
# Ask the NameNode on which DataNode to write the file.
response <- PUT(uri)
# Get the URL of the DataNode returned by HDFS.
uriWrite <- response$url
-
Push the file:
# Upload the local CSV file to the DataNode with a "PUT" request.
PUT(uriWrite, body = upload_file("my_local_file.csv"))
-
With Kerberos, ask the NameNode on which DataNode to write the file:
# Turn off SSL peer verification for the requests that follow.
set_config(config(ssl_verifypeer = 0L))
# Authenticate with Kerberos.
auth <- authenticate(":", "", "gssnegotiate")
# Ask the NameNode on which DataNode to write the file. `uri` is the
# WebHDFS URL built from the write access parameters.
response <- PUT(uri, auth)
# Get the URL of the DataNode returned by HDFS.
uriWrite <- response$url
-
Push the file:
# Upload the local CSV file to the DataNode with an authenticated "PUT"
# request. The file name matches the one written locally with write.csv().
responseWrite <- PUT(uriWrite, auth, body = upload_file("my_local_file.csv"))
-
See also
-
Code example to read and write files from HDFS with R (GitHub Gist page)