2017-04-27 23:52:11
pi
datasets
包: cars, mtcars, airquality, faithful
MASS
包: UScrime, Boston, Animals
ggplot2
包: diamonds, economics
data
data(<数据集>)
str(<数据集>)
View(<数据集>)
> data(cars)
> str(cars)
'data.frame': 50 obs. of 2 variables:
 $ speed: num 4 4 7 7 8 9 10 10 10 11 ...
 $ dist : num 2 10 4 22 16 10 18 26 34 17 ...
我从Outlook打开一个Excel附件,突然想拿出其中的一部分数据跑个时间序列分析。 但这个文件和分析都用于初步探索,不需要保存。我也比较懒,不高兴费劲另存、读取、选片再分析。
utils包read.table
read.table(file, header = FALSE, sep = "", quote = "\"'", dec = ".", numerals = c("allow.loss", "warn.loss", "no.loss"), row.names, col.names, as.is = !stringsAsFactors, na.strings = "NA", colClasses = NA, nrows = -1, skip = 0, check.names = TRUE, fill = !blank.lines.skip, strip.white = FALSE, blank.lines.skip = TRUE, comment.char = "#", allowEscapes = FALSE, flush = FALSE, stringsAsFactors = default.stringsAsFactors(), fileEncoding = "", encoding = "unknown", text, skipNul = FALSE)
方法: 两步:
read.table(file="clipboard"),或read.table(text=readClipboard())
readClipboard是utils包的函数,读取后返回的是一个制表符('\t')分隔的字符串。
退出时,R会询问是否保存工作空间(workspace)的镜像(image)。什么是镜像?
load()函数加载,即可读入该镜像文件内封存的所有对象(当前的同名对象会被覆盖)。
readRDS函数读入
readr包read_csv()
read_csv(file, col_names = TRUE, col_types = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, guess_max = min(1000, n_max), progress = interactive())
> library(readr) # 载入包 > csv.file <- system.file("extdata/challenge.csv", package="readr") # 指定文件路径 > data <- read_csv(csv.file, col_types="nD") # 读入数据 > str(data) # 看结构 Classes 'tbl_df' 'tbl' and 'data.frame': 2000 obs. of 2 variables: $ x: num 404 4172 3004 787 37 ... $ y: Date, format: NA NA NA NA ...
也可以用自带函数utils::read.csv
> data <- read.csv(csv.file, colClasses=c("numeric", "Date")) > str(data) 'data.frame': 2000 obs. of 2 variables: $ x: num 404 4172 3004 787 37 ... $ y: Date, format: NA NA NA NA ...
但read_csv
更好:
推荐始终用csv给R传递数据
readxl
包read_excel()
read_excel(path, sheet = 1, col_names = TRUE, col_types = NULL, na = "", skip = 0)
> datasets <- system.file("extdata/datasets.xlsx", package = "readxl") > data <- read_excel(datasets) > str(data) Classes 'tbl_df' 'tbl' and 'data.frame': 150 obs. of 5 variables: $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... $ Species : chr "setosa" "setosa" "setosa" "setosa" ...
其它可选包
openxlsx, xlsx: 读xlsx文件
XLConnect: 读xls文件
RODBC: (除了读ODBC驱动的数据库外) 读xls文件
但它们都不如readxl好用
libxls和RapidXML,执行速度更快
传统三大统计软件SAS, SPSS, Stata都可以和R无缝切换
haven包
read_sas
read_sav / read_por
read_spss
read_dta
read_stata
foreign包
read.epiinfo
read.dbf
read.mtp
read.octave
read.systat
read.S
read.arff
RODBC包 (仅限Windows) - by Brian Ripley
DBI家族 - by Hadley Wickham
RJDBC: 提供JDBC引擎接口
ROracle: Oracle
RMySQL: MySQL
RSQLServer: MS SQL Server
RPostgreSQL: PostgreSQL
RSQLite: SQLite
RODBC包读取Access数据库
> library(RODBC)                                    # 1. 加载RODBC
> db <- "ProvCity.accdb"                            # 2. 数据库位置(dsn)
> con <- odbcConnectAccess2007(db)                  # 3. 建立数据库连接
> tbl <- sqlTables(con, tableType = "Table")        # 4. 数据库所有非系统表
> tbl$TABLE_NAME                                    # 5. 打印表名
[1] "CaptCity" "City" "Province"
> prov <- sqlFetch(con, 'Province')                 # 6. 读取Province表
> head(prov, 2)                                     # 7. prov前2行
  ProvID ProvCode Province ProvAlias ProvNick CountryID ProvinceEN IsCapital
1      1       11   北京市      北京       京        48    Beijing         1
2      2       12   天津市      天津       津        48    Tianjin         0
> qry <- paste("select Province.ProvAlias as Prov, count(City.CityID) as N",
+              "from Province inner join City on Province.ProvID = City.ProvID",
+              "group by Province.ProvAlias;")      # 8. 统计各省城市数的查询
> city.n <- sqlQuery(con, qry)                      # 9. 运行查询
> head(city.n, 2)                                   # 10. 查看前2行结果
  Prov  N
1 安徽 17
2 北京  2
> odbcClose(con)                                    # 11. 关闭数据库连接
readr::read_lines
read_lines(file, skip = 0, n_max = -1L, locale = default_locale(), na = character(), progress = interactive())
> txt.file <- system.file("extdata/epa78.txt", package="readr") > data <- read_lines(txt.file) > str(data) chr [1:20] "ALFA ROMEO ALFA ROMEO 78010003"...
readr::read_fwf
read_fwf(file, col_positions, col_types = NULL, locale = default_locale(), na = c("", "NA"), comment = "", skip = 0, n_max = Inf, guess_max = min(n_max, 1000), progress = interactive())
> txt.file <- system.file("extdata/epa78.txt", package="readr") > data <- read_fwf(txt.file, fwf_positions( + c(1, 38, 41, 45, 51, 54, 58, 62, 67, 70, 80, 101), + c(37, 40, 44, 50, 53, 57, 61, 66, 69, 79, 100, 108))) > head(data, 2) X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 <chr> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <chr> <int> 1 ALFA ROMEO <NA> NA NA NA NA NA NA NA NA ALFA ROMEO 78010003 2 ALFETTA 03 81 8 74 7 89 9 NA NA ALFETTA 78010053
readr::read_log
read_log(file, col_names = FALSE, col_types = NULL, skip = 0, n_max = -1, progress = interactive())
> log.file <- system.file("extdata/compound.log", package="readr") > str(read_log(log.file)) Classes 'tbl_df' 'tbl' and 'data.frame': 2 obs. of 9 variables: $ X1: chr "74.133.75.225" "162.13.87.136" $ X2: chr NA NA $ X3: chr NA NA $ X4: chr "01/Jan/2015:15:15:20 -0500" "01/Jan/2015:15:15:16 -0500" $ X5: chr "GET blah/ HTTP/1.0" "GET / HTTP/1.0" $ X6: int 200 200 $ X7: int 202 6934 $ X8: chr NA NA $ X9: chr "R (3.1.2 x86_64-apple-darwin13.4.0 x86_64 darwin13.4.0)..."
readr::read_lines
==>
读作页面代码文本
> data <- read_lines("https://finance.yahoo.com") > str(data) chr [1:104] "<!DOCTYPE html><html id=\"atomic\" class=\"NoJs desktop\" lang=\"en-US\"><head prefix=\"og: http://ogp.me/ns#\"><script>window."| __truncated__ ...
XML包
rvest、curl和RCurl
> library(XML) # 加载XML包 > data <- readHTMLTable("A02_02_io_files/file/BOC EXCHANGE RATE(new).html", header = TRUE) # 读取保存到本地的页面文件 > str(data[[8]]) # 共获得23个列表,我们要的是其中第8个 'data.frame': 27 obs. of 7 variables: $ Currency Name : Factor w/ 27 levels "AED","AUD","BRL",..: 1 2 3 4 5 6 7 8 9 10 ... $ Buying Rate : Factor w/ 22 levels "","0.5887","10.87",..: 1 11 1 12 14 22 16 20 21 1 ... $ Cash Buying Rate : Factor w/ 27 levels "","0.05","0.568",..: 8 15 10 17 18 27 20 24 25 2 ... $ Selling Rate : Factor w/ 22 levels "","0.5935","10.95",..: 1 11 1 12 14 22 16 20 21 1 ... $ Cash Selling Rate: Factor w/ 27 levels "","0.0536","0.615",..: 8 15 11 16 19 27 21 24 25 2 ... $ Middle Rate : Factor w/ 27 levels "0.0517","0.5929",..: 8 16 11 17 19 27 21 25 26 1 ... $ Pub Time : Factor w/ 1 level "2016-12-09 \n\t\t10:07:37": 1 1 1 1 1 1 1 1 1 1 ...
XML / xml2包
jsonlite包
jpeg包
png包
maptools / sp包
save
save(..., list = character(), file = stop("'file' must be specified"), ascii = FALSE, version = NULL, envir = parent.frame(), compress = isTRUE(!ascii), compression_level, eval.promises = TRUE, precheck = TRUE)
save(<obj1>, <obj2>, file="~/image.RData")
save.image是其快捷形式,将当前环境下所有对象都存入目标文件.RData。
readr::write_csv (或自带函数utils::write.csv,它是write.table的封装,sep固定为",")
write.table(x, file = "", append = FALSE, quote = TRUE, sep = " ", eol = "\n", na = "NA", dec = ".", row.names = TRUE, col.names = TRUE, qmethod = c("escape", "double"), fileEncoding = "")
write.csv(<data.frame>, file="~/target.csv")
openxlsx::write.xlsx
write.xlsx(x, file, asTable = FALSE, ...)
write.xlsx(<data.frame>, file="~/target.xlsx")
create table ...
alter table ...
drop table ...
insert into table (...) ...
delete ... from table ...
update table set ...
Thank you!