2017-04-22 20:38:45
dplyr的处理思路来源于化整为零和SQL,提供了数据整理的统一框架
magrittr
包的pipe算符%>%
将整形步骤串联为单句命令DBI
包将dplyr
语法套用到关系型数据库tbl_df
函数> tbl_df(iris) # A tibble: 150 × 5 Sepal.Length Sepal.Width Petal.Length Petal.Width Species <dbl> <dbl> <dbl> <dbl> <fctr> 1 5.1 3.5 1.4 0.2 setosa 2 4.9 3.0 1.4 0.2 setosa 3 4.7 3.2 1.3 0.2 setosa 4 4.6 3.1 1.5 0.2 setosa 5 5.0 3.6 1.4 0.2 setosa 6 5.4 3.9 1.7 0.4 setosa 7 4.6 3.4 1.4 0.3 setosa 8 5.0 3.4 1.5 0.2 setosa 9 4.4 2.9 1.4 0.2 setosa 10 4.9 3.1 1.5 0.1 setosa # ... with 140 more rows
select
函数选列> select(tbl_df(iris), starts_with("Petal")) # A tibble: 150 × 2 Petal.Length Petal.Width <dbl> <dbl> 1 1.4 0.2 2 1.4 0.2 3 1.3 0.2 4 1.5 0.2 5 1.4 0.2 6 1.7 0.4 7 1.4 0.3 8 1.5 0.2 9 1.4 0.2 10 1.5 0.1 # ... with 140 more rows
> select(tbl_df(iris), -contains("Petal")) # A tibble: 150 × 3 Sepal.Length Sepal.Width Species <dbl> <dbl> <fctr> 1 5.1 3.5 setosa 2 4.9 3.0 setosa 3 4.7 3.2 setosa 4 4.6 3.1 setosa 5 5.0 3.6 setosa 6 5.4 3.9 setosa 7 4.6 3.4 setosa 8 5.0 3.4 setosa 9 4.4 2.9 setosa 10 4.9 3.1 setosa # ... with 140 more rows
select
函数选列(续)> select(tbl_df(iris), `Petal_Len`=Petal.Length) # A tibble: 150 × 1 Petal_Len <dbl> 1 1.4 2 1.4 3 1.3 4 1.5 5 1.4 6 1.7 7 1.4 8 1.5 9 1.4 10 1.5 # ... with 140 more rows
> select(tbl_df(iris), `Petal_Len`=Petal.Length, + everything()) # A tibble: 150 × 5 Petal_Len Sepal.Length Sepal.Width Petal.Width Species <dbl> <dbl> <dbl> <dbl> <fctr> 1 1.4 5.1 3.5 0.2 setosa 2 1.4 4.9 3.0 0.2 setosa 3 1.3 4.7 3.2 0.2 setosa 4 1.5 4.6 3.1 0.2 setosa 5 1.4 5.0 3.6 0.2 setosa 6 1.7 5.4 3.9 0.4 setosa 7 1.4 4.6 3.4 0.3 setosa 8 1.5 5.0 3.4 0.2 setosa 9 1.4 4.4 2.9 0.2 setosa 10 1.5 4.9 3.1 0.1 setosa # ... with 140 more rows
select_helpers
函数根据变量名选择
starts_with()
: starts with a prefixends_with()
: ends with a prefixcontains()
: contains a literal stringmatches()
: matches a regular expressionnum_range()
: a numerical range like x01, x02, x03.one_of()
: variables in character vector.everything()
: all variables.filter
函数筛行filter(.data, ...)
> filter(tbl_df(iris), Petal.Width > 2 & Species == 'virginica') # A tibble: 23 × 5 Sepal.Length Sepal.Width Petal.Length Petal.Width Species <dbl> <dbl> <dbl> <dbl> <fctr> 1 6.3 3.3 6.0 2.5 virginica 2 7.1 3.0 5.9 2.1 virginica 3 6.5 3.0 5.8 2.2 virginica 4 7.6 3.0 6.6 2.1 virginica 5 7.2 3.6 6.1 2.5 virginica 6 6.8 3.0 5.5 2.1 virginica 7 5.8 2.8 5.1 2.4 virginica 8 6.4 3.2 5.3 2.3 virginica 9 7.7 3.8 6.7 2.2 virginica 10 7.7 2.6 6.9 2.3 virginica # ... with 13 more rows
slice
函数切片slice(.data, ...)
> slice(tbl_df(iris), 1:2) # A tibble: 2 × 5 Sepal.Length Sepal.Width Petal.Length Petal.Width Species <dbl> <dbl> <dbl> <dbl> <fctr> 1 5.1 3.5 1.4 0.2 setosa 2 4.9 3.0 1.4 0.2 setosa > slice(group_by(iris, Species), 1:2) Source: local data frame [6 x 5] Groups: Species [3] Sepal.Length Sepal.Width Petal.Length Petal.Width Species <dbl> <dbl> <dbl> <dbl> <fctr> 1 5.1 3.5 1.4 0.2 setosa 2 4.9 3.0 1.4 0.2 setosa 3 7.0 3.2 4.7 1.4 versicolor 4 6.4 3.2 4.5 1.5 versicolor 5 6.3 3.3 6.0 2.5 virginica 6 5.8 2.7 5.1 1.9 virginica
arrange
函数排序arrange(.data, ...)
> arrange(tbl_df(iris), Sepal.Length, desc(Sepal.Width)) # A tibble: 150 × 5 Sepal.Length Sepal.Width Petal.Length Petal.Width Species <dbl> <dbl> <dbl> <dbl> <fctr> 1 4.3 3.0 1.1 0.1 setosa 2 4.4 3.2 1.3 0.2 setosa 3 4.4 3.0 1.3 0.2 setosa 4 4.4 2.9 1.4 0.2 setosa 5 4.5 2.3 1.3 0.3 setosa 6 4.6 3.6 1.0 0.2 setosa 7 4.6 3.4 1.4 0.3 setosa 8 4.6 3.2 1.4 0.2 setosa 9 4.6 3.1 1.5 0.2 setosa 10 4.7 3.2 1.3 0.2 setosa # ... with 140 more rows
mutate
函数创建变量mutate(.data, ...)
> mutate(tbl_df(iris), Sepal.LWR = Sepal.Length/Sepal.Width) # A tibble: 150 × 6 Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.LWR <dbl> <dbl> <dbl> <dbl> <fctr> <dbl> 1 5.1 3.5 1.4 0.2 setosa 1.457143 2 4.9 3.0 1.4 0.2 setosa 1.633333 3 4.7 3.2 1.3 0.2 setosa 1.468750 4 4.6 3.1 1.5 0.2 setosa 1.483871 5 5.0 3.6 1.4 0.2 setosa 1.388889 6 5.4 3.9 1.7 0.4 setosa 1.384615 7 4.6 3.4 1.4 0.3 setosa 1.352941 8 5.0 3.4 1.5 0.2 setosa 1.470588 9 4.4 2.9 1.4 0.2 setosa 1.517241 10 4.9 3.1 1.5 0.1 setosa 1.580645 # ... with 140 more rows
summarise
函数汇总summarize(.data, ...)
> summarise(iris, n=n(), sepal_width=median(Sepal.Width)) n sepal_width 1 150 3 > summarise(group_by(iris, Species), + m=mean(Sepal.Length), sd=sd(Sepal.Length)) # A tibble: 3 × 3 Species m sd <fctr> <dbl> <dbl> 1 setosa 5.006 0.3524897 2 versicolor 5.936 0.5161711 3 virginica 6.588 0.6358796
%>%
magrittr::
%>%`x %>% f
==> f(x)
x %>% f(y)
==> f(x, y)
y %>% f(x, .)
==> f(x, y)
z %>% f(x, y, arg = .)
==> f(x, y, arg = z)
%>%
,将数据整形步骤一体化> iris %>% head(1) # head(iris, 1) Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1 5.1 3.5 1.4 0.2 setosa > 1:10 %>% {c(min(.), max(.))} # c(min(1:10), max(1:10)) [1] 1 10
reshape2::melt
> library(reshape2) > iris.new1 <- melt(iris, id.var='Species') > head(iris.new1) Species variable value 1 setosa Sepal.Length 5.1 2 setosa Sepal.Length 4.9 3 setosa Sepal.Length 4.7 4 setosa Sepal.Length 4.6 5 setosa Sepal.Length 5.0 6 setosa Sepal.Length 5.4
tidyr::gather
> library(tidyr) > iris.new2 <- gather(iris, Measure, Value, -Species) > head(iris.new2) Species Measure Value 1 setosa Sepal.Length 5.1 2 setosa Sepal.Length 4.9 3 setosa Sepal.Length 4.7 4 setosa Sepal.Length 4.6 5 setosa Sepal.Length 5.0 6 setosa Sepal.Length 5.4
reshape2::dcast
是melt
的逆算法,也可用于汇总分析。> library(reshape2) > iris.new1$id <- rep(1:50, 12) > iris.cast1 <- dcast(iris.new1, id+Species~variable, mean) > head(iris.cast1) id Species Sepal.Length Sepal.Width Petal.Length Petal.Width 1 1 setosa 5.1 3.5 1.4 0.2 2 1 versicolor 7.0 3.2 4.7 1.4 3 1 virginica 6.3 3.3 6.0 2.5 4 2 setosa 4.9 3.0 1.4 0.2 5 2 versicolor 6.4 3.2 4.5 1.5 6 2 virginica 5.8 2.7 5.1 1.9
tidyr::spread
是gather
的逆算法,但要求key:value pair的唯一性> library(tidyr) > iris.new2$Species <- paste(rep(1:50, 12), + iris.new2$Species, sep=".") > iris.cast2 <- spread(iris.new2, Measure, Value) > iris.cast2$Species <- gsub("^\\d+\\.(.+)$", "\\1", + iris.cast2$Species) > head(iris.cast2) Species Petal.Length Petal.Width Sepal.Length Sepal.Width 1 setosa 1.4 0.2 5.1 3.5 2 versicolor 4.7 1.4 7.0 3.2 3 virginica 6.0 2.5 6.3 3.3 4 setosa 1.5 0.1 4.9 3.1 5 versicolor 3.9 1.4 5.2 2.7 6 virginica 6.1 2.5 7.2 3.6
选取iris数据集Sepal.Length > 6的记录,构造一个指标SPWR (Sepal.Width与Petal.Width之比)。 按Species分组,分析SPWR的均数及标准差。
group_by(Species)
,构造分组tbl_dffilter(Sepal.Length > 6)
,筛选合格的记录mutate(SPWR=Sepal.Width / Petal.Width)
,构造次生变量summarise(mean(SPWR), sd(SPWR))
,计算指标> iris %>% group_by(Species) %>% filter(Sepal.Length > 6) %>% + select(contains('Width')) %>% mutate(SPWR=Sepal.Width / Petal.Width) %>% + summarise(m=mean(SPWR), sd=sd(SPWR)) # A tibble: 2 × 3 Species m sd <fctr> <dbl> <dbl> 1 versicolor 2.044987 0.2271170 2 virginica 1.494320 0.1874827
DT[i, j, by]
: 对于数据集DT,选取子集i行,通过by对j列分组计算> library(data.table) > data.table(iris) Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1: 5.1 3.5 1.4 0.2 setosa 2: 4.9 3.0 1.4 0.2 setosa 3: 4.7 3.2 1.3 0.2 setosa 4: 4.6 3.1 1.5 0.2 setosa 5: 5.0 3.6 1.4 0.2 setosa --- 146: 6.7 3.0 5.2 2.3 virginica 147: 6.3 2.5 5.0 1.9 virginica 148: 6.5 3.0 5.2 2.0 virginica 149: 6.2 3.4 5.4 2.3 virginica 150: 5.9 3.0 5.1 1.8 virginica
fread
快速读入数据文件flights <- fread("flights14.csv") flights # year month day dep_delay arr_delay carrier origin dest air_time distance hour # 1: 2014 1 1 14 13 AA JFK LAX 359 2475 9 # 2: 2014 1 1 -3 13 AA JFK LAX 363 2475 11 # 3: 2014 1 1 2 9 AA JFK LAX 351 2475 19 # 4: 2014 1 1 -8 -26 AA LGA PBI 157 1035 7 # 5: 2014 1 1 2 1 AA JFK LAX 350 2475 13 # --- # 253312: 2014 10 31 1 -30 UA LGA IAH 201 1416 14 # 253313: 2014 10 31 -5 -14 UA EWR IAH 189 1400 8 # 253314: 2014 10 31 -8 16 MQ LGA RDU 83 431 11 # 253315: 2014 10 31 -4 15 MQ LGA DTW 75 502 11 # 253316: 2014 10 31 -5 1 MQ LGA SDF 110 659 8 dim(flights) # [1] 253316 11
R: | i | j | by |
---|---|---|---|
SQL: | where | select | update | group by |
data.table的基本形式为
DT[i, j, by]
相当于SQL
select `j` from `DT` where `i` group by `by`
可以理解为
读取
DT
, 以i
筛选行, 然后计算j
, 按by
分组.
i
> data.table(iris)[1:2] Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1: 5.1 3.5 1.4 0.2 setosa 2: 4.9 3.0 1.4 0.2 setosa
on
: 复合筛选Sepal.Width == 3.1, Species == 'setosa'的记录> data.table(iris)[.(3.1, 'setosa'), on=.(Sepal.Width, Species)] Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1: 4.6 3.1 1.5 0.2 setosa 2: 4.9 3.1 1.5 0.1 setosa 3: 4.8 3.1 1.6 0.2 setosa 4: 4.9 3.1 1.5 0.2 setosa
> data.table(iris)[[1]] # 或 data.table(iris)[, Sepal.Length] [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 ...
j
取list()
或 .()
> data.table(iris)[,1:2] > data.table(iris)[, list(Sepal.Length, Sepal.Width)], > data.table(iris)[, .(Sepal.Length, Sepal.Width)] Sepal.Length Sepal.Width 1: 5.1 3.5 --- 150: 5.9 3.0
,with=FALSE
> data.table(iris)[, seq(1, 5, 2), with=FALSE] Sepal.Length Petal.Length Species 1: 5.1 1.4 setosa --- 150: 5.9 5.1 virginica
subset
> subset(data.table(iris), select = startsWith( + names(iris), "Petal")) > data.table(iris)[, .(Petal.Length, Petal.Width)] Petal.Length Petal.Width 1: 1.4 0.2 2: 1.4 0.2 3: 1.3 0.2 4: 1.5 0.2 5: 1.4 0.2 --- 146: 5.2 2.3 147: 5.0 1.9 148: 5.2 2.0 149: 5.4 2.3 150: 5.1 1.8
> subset(data.table(iris), select = !grepl( + "Petal", names(iris))) > data.table(iris)[, -(3:4)] > data.table(iris)[, c("Petal.Width", + "Petal.Length") :=NULL] Sepal.Length Sepal.Width Species 1: 5.1 3.5 setosa 2: 4.9 3.0 setosa 3: 4.7 3.2 setosa 4: 4.6 3.1 setosa 5: 5.0 3.6 setosa --- 146: 6.7 3.0 virginica 147: 6.3 2.5 virginica 148: 6.5 3.0 virginica 149: 6.2 3.4 virginica 150: 5.9 3.0 virginica
重命名某些列
setnames
> d <- data.table(iris) > setnames(d, 1:4, c("SL", "SW", "PL", "PW")) > d SL SW PL PW Species 1: 4.3 3.0 1.1 0.1 setosa 2: 4.4 3.2 1.3 0.2 setosa 3: 4.4 3.0 1.3 0.2 setosa 4: 4.4 2.9 1.4 0.2 setosa 5: 4.5 2.3 1.3 0.3 setosa --- 146: 7.7 3.8 6.7 2.2 virginica 147: 7.7 3.0 6.1 2.3 virginica 148: 7.7 2.8 6.7 2.0 virginica 149: 7.7 2.6 6.9 2.3 virginica 150: 7.9 3.8 6.4 2.0 virginica
重组列的次序
setcolorder
> setcolorder(d, c(5, 1:4)) > d Species PW SL SW PL 1: setosa 0.1 4.3 3.0 1.1 2: setosa 0.2 4.4 3.2 1.3 3: setosa 0.2 4.4 3.0 1.3 4: setosa 0.2 4.4 2.9 1.4 5: setosa 0.3 4.5 2.3 1.3 --- 146: virginica 2.2 7.7 3.8 6.7 147: virginica 2.3 7.7 3.0 6.1 148: virginica 2.0 7.7 2.8 6.7 149: virginica 2.3 7.7 2.6 6.9 150: virginica 2.0 7.9 3.8 6.4
> data.table(iris)[Petal.Width > 2 & Species == 'virginica'] Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1: 6.3 3.3 6.0 2.5 virginica 2: 7.1 3.0 5.9 2.1 virginica 3: 6.5 3.0 5.8 2.2 virginica 4: 7.6 3.0 6.6 2.1 virginica 5: 7.2 3.6 6.1 2.5 virginica ...... 19: 6.9 3.1 5.1 2.3 virginica 20: 6.8 3.2 5.9 2.3 virginica 21: 6.7 3.3 5.7 2.5 virginica 22: 6.7 3.0 5.2 2.3 virginica 23: 6.2 3.4 5.4 2.3 virginica Sepal.Length Sepal.Width Petal.Length Petal.Width Species
setorder
> data.table(iris)[order(Sepal.Length, -Sepal.Width)] > setorder(iris, Sepal.Length, -Sepal.Width) Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1: 4.3 3.0 1.1 0.1 setosa 2: 4.4 3.2 1.3 0.2 setosa 3: 4.4 3.0 1.3 0.2 setosa 4: 4.4 2.9 1.4 0.2 setosa 5: 4.5 2.3 1.3 0.3 setosa --- 146: 7.7 3.8 6.7 2.2 virginica 147: 7.7 3.0 6.1 2.3 virginica 148: 7.7 2.8 6.7 2.0 virginica 149: 7.7 2.6 6.9 2.3 virginica 150: 7.9 3.8 6.4 2.0 virginica
:=
(仅限单个)> data.table(iris)[, Sepal.LWR := Sepal.Length/Sepal.Width] Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.LWR 1: 4.3 3.0 1.1 0.1 setosa 1.433333 2: 4.4 3.2 1.3 0.2 setosa 1.375000 --- 149: 7.7 2.6 6.9 2.3 virginica 2.961538 150: 7.9 3.8 6.4 2.0 virginica 2.078947
> data.table(iris)[, Sepal.LWR := Sepal.Length/Sepal.Width][ + , Petal.LWR := Petal.Length/Petal.Width)] Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.LWR Petal.LWR 1: 5.1 3.5 1.4 0.2 setosa 1.457143 7.000000 2: 4.9 3.0 1.4 0.2 setosa 1.633333 7.000000 --- 149: 6.2 3.4 5.4 2.3 virginica 1.823529 2.347826 150: 5.9 3.0 5.1 1.8 virginica 1.966667 2.833333
melt
和dcast
library(data.table)
后调用,如为data.table类则直接调用改写后的函数> iris.new1 <- melt(data.table(iris), id.var='Species') > head(iris.new1, 4) Species variable value 1: setosa Sepal.Length 4.3 2: setosa Sepal.Length 4.4 3: setosa Sepal.Length 4.4 4: setosa Sepal.Length 4.4
> iris.cast1 <- dcast(iris.new1, Species~variable, mean) > iris.cast1 Species Sepal.Length Sepal.Width Petal.Length Petal.Width 1: setosa 5.006 3.428 1.462 0.246 2: versicolor 5.936 2.770 4.260 1.326 3: virginica 6.588 2.974 5.552 2.026
j
整合计算.N
> data.table(iris)[, .N, by=Species] Species N 1: setosa 50 2: versicolor 50 3: virginica 50
.SD
: 代表按by
分拆的data.table们(.SDcols
指定分组变量)> data.table(iris)[, lapply(.SD, mean), by=Species, .SDcols=c("Sepal.Width", "Petal.Width")] Species Sepal.Width Petal.Width 1: setosa 3.428 0.246 2: versicolor 2.770 1.326 3: virginica 2.974 2.026
选取iris数据集Sepal.Length > 6的记录,构造一个指标SPWR (Sepal.Width与Petal.Width之比)。 按Species分组,分析SPWR的均数及标准差。
> d <- data.table(iris) > d[Sepal.Length > 6, SPWR := Sepal.Width / Petal.Width + ][, list(m = mean(SPWR, na.rm=TRUE), + sd = sd(SPWR, na.rm=TRUE)), by=Species] Species m sd 1: setosa NaN NA 2: versicolor 2.044987 0.2271170 3: virginica 1.494320 0.1874827
Thank you!