2017-04-22 20:38:45
dplyr的处理思路来源于化整为零和SQL,提供了数据整理的统一框架
magrittr包的pipe算符%>%将整形步骤串联为单句命令DBI包将dplyr语法套用到关系型数据库tbl_df函数> tbl_df(iris)
# A tibble: 150 × 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fctr>
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
7 4.6 3.4 1.4 0.3 setosa
8 5.0 3.4 1.5 0.2 setosa
9 4.4 2.9 1.4 0.2 setosa
10 4.9 3.1 1.5 0.1 setosa
# ... with 140 more rows
select函数选列> select(tbl_df(iris), starts_with("Petal"))
# A tibble: 150 × 2
Petal.Length Petal.Width
<dbl> <dbl>
1 1.4 0.2
2 1.4 0.2
3 1.3 0.2
4 1.5 0.2
5 1.4 0.2
6 1.7 0.4
7 1.4 0.3
8 1.5 0.2
9 1.4 0.2
10 1.5 0.1
# ... with 140 more rows
> select(tbl_df(iris), -contains("Petal"))
# A tibble: 150 × 3
Sepal.Length Sepal.Width Species
<dbl> <dbl> <fctr>
1 5.1 3.5 setosa
2 4.9 3.0 setosa
3 4.7 3.2 setosa
4 4.6 3.1 setosa
5 5.0 3.6 setosa
6 5.4 3.9 setosa
7 4.6 3.4 setosa
8 5.0 3.4 setosa
9 4.4 2.9 setosa
10 4.9 3.1 setosa
# ... with 140 more rowsselect函数选列(续)> select(tbl_df(iris), `Petal_Len`=Petal.Length)
# A tibble: 150 × 1
Petal_Len
<dbl>
1 1.4
2 1.4
3 1.3
4 1.5
5 1.4
6 1.7
7 1.4
8 1.5
9 1.4
10 1.5
# ... with 140 more rows
> select(tbl_df(iris), `Petal_Len`=Petal.Length,
+ everything())
# A tibble: 150 × 5
Petal_Len Sepal.Length Sepal.Width Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fctr>
1 1.4 5.1 3.5 0.2 setosa
2 1.4 4.9 3.0 0.2 setosa
3 1.3 4.7 3.2 0.2 setosa
4 1.5 4.6 3.1 0.2 setosa
5 1.4 5.0 3.6 0.2 setosa
6 1.7 5.4 3.9 0.4 setosa
7 1.4 4.6 3.4 0.3 setosa
8 1.5 5.0 3.4 0.2 setosa
9 1.4 4.4 2.9 0.2 setosa
10 1.5 4.9 3.1 0.1 setosa
# ... with 140 more rowsselect_helpers函数根据变量名选择
starts_with(): starts with a prefixends_with(): ends with a prefixcontains(): contains a literal stringmatches(): matches a regular expressionnum_range(): a numerical range like x01, x02, x03.one_of(): variables in character vector.everything(): all variables.filter函数筛行filter(.data, ...)
> filter(tbl_df(iris), Petal.Width > 2 & Species == 'virginica')
# A tibble: 23 × 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fctr>
1 6.3 3.3 6.0 2.5 virginica
2 7.1 3.0 5.9 2.1 virginica
3 6.5 3.0 5.8 2.2 virginica
4 7.6 3.0 6.6 2.1 virginica
5 7.2 3.6 6.1 2.5 virginica
6 6.8 3.0 5.5 2.1 virginica
7 5.8 2.8 5.1 2.4 virginica
8 6.4 3.2 5.3 2.3 virginica
9 7.7 3.8 6.7 2.2 virginica
10 7.7 2.6 6.9 2.3 virginica
# ... with 13 more rows
slice函数切片slice(.data, ...)
> slice(tbl_df(iris), 1:2)
# A tibble: 2 × 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fctr>
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
> slice(group_by(iris, Species), 1:2)
Source: local data frame [6 x 5]
Groups: Species [3]
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fctr>
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 7.0 3.2 4.7 1.4 versicolor
4 6.4 3.2 4.5 1.5 versicolor
5 6.3 3.3 6.0 2.5 virginica
6 5.8 2.7 5.1 1.9 virginica
arrange函数排序arrange(.data, ...)
> arrange(tbl_df(iris), Sepal.Length, desc(Sepal.Width))
# A tibble: 150 × 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fctr>
1 4.3 3.0 1.1 0.1 setosa
2 4.4 3.2 1.3 0.2 setosa
3 4.4 3.0 1.3 0.2 setosa
4 4.4 2.9 1.4 0.2 setosa
5 4.5 2.3 1.3 0.3 setosa
6 4.6 3.6 1.0 0.2 setosa
7 4.6 3.4 1.4 0.3 setosa
8 4.6 3.2 1.4 0.2 setosa
9 4.6 3.1 1.5 0.2 setosa
10 4.7 3.2 1.3 0.2 setosa
# ... with 140 more rows
mutate函数创建变量mutate(.data, ...)
> mutate(tbl_df(iris), Sepal.LWR = Sepal.Length/Sepal.Width)
# A tibble: 150 × 6
Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.LWR
<dbl> <dbl> <dbl> <dbl> <fctr> <dbl>
1 5.1 3.5 1.4 0.2 setosa 1.457143
2 4.9 3.0 1.4 0.2 setosa 1.633333
3 4.7 3.2 1.3 0.2 setosa 1.468750
4 4.6 3.1 1.5 0.2 setosa 1.483871
5 5.0 3.6 1.4 0.2 setosa 1.388889
6 5.4 3.9 1.7 0.4 setosa 1.384615
7 4.6 3.4 1.4 0.3 setosa 1.352941
8 5.0 3.4 1.5 0.2 setosa 1.470588
9 4.4 2.9 1.4 0.2 setosa 1.517241
10 4.9 3.1 1.5 0.1 setosa 1.580645
# ... with 140 more rows
summarise函数汇总summarize(.data, ...)
> summarise(iris, n=n(), sepal_width=median(Sepal.Width))
n sepal_width
1 150 3
> summarise(group_by(iris, Species),
+ m=mean(Sepal.Length), sd=sd(Sepal.Length))
# A tibble: 3 × 3
Species m sd
<fctr> <dbl> <dbl>
1 setosa 5.006 0.3524897
2 versicolor 5.936 0.5161711
3 virginica 6.588 0.6358796
%>%magrittr::%>%`x %>% f ==> f(x)x %>% f(y) ==> f(x, y)y %>% f(x, .) ==> f(x, y)z %>% f(x, y, arg = .) ==> f(x, y, arg = z)%>%,将数据整形步骤一体化> iris %>% head(1) # head(iris, 1)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
> 1:10 %>% {c(min(.), max(.))} # c(min(1:10), max(1:10))
[1] 1 10
reshape2::melt> library(reshape2) > iris.new1 <- melt(iris, id.var='Species') > head(iris.new1) Species variable value 1 setosa Sepal.Length 5.1 2 setosa Sepal.Length 4.9 3 setosa Sepal.Length 4.7 4 setosa Sepal.Length 4.6 5 setosa Sepal.Length 5.0 6 setosa Sepal.Length 5.4
tidyr::gather> library(tidyr) > iris.new2 <- gather(iris, Measure, Value, -Species) > head(iris.new2) Species Measure Value 1 setosa Sepal.Length 5.1 2 setosa Sepal.Length 4.9 3 setosa Sepal.Length 4.7 4 setosa Sepal.Length 4.6 5 setosa Sepal.Length 5.0 6 setosa Sepal.Length 5.4
reshape2::dcast是melt的逆算法,也可用于汇总分析。> library(reshape2) > iris.new1$id <- rep(1:50, 12) > iris.cast1 <- dcast(iris.new1, id+Species~variable, mean) > head(iris.cast1) id Species Sepal.Length Sepal.Width Petal.Length Petal.Width 1 1 setosa 5.1 3.5 1.4 0.2 2 1 versicolor 7.0 3.2 4.7 1.4 3 1 virginica 6.3 3.3 6.0 2.5 4 2 setosa 4.9 3.0 1.4 0.2 5 2 versicolor 6.4 3.2 4.5 1.5 6 2 virginica 5.8 2.7 5.1 1.9
tidyr::spread是gather的逆算法,但要求key:value pair的唯一性> library(tidyr)
> iris.new2$Species <- paste(rep(1:50, 12),
+ iris.new2$Species, sep=".")
> iris.cast2 <- spread(iris.new2, Measure, Value)
> iris.cast2$Species <- gsub("^\\d+\\.(.+)$", "\\1",
+ iris.cast2$Species)
> head(iris.cast2)
Species Petal.Length Petal.Width Sepal.Length Sepal.Width
1 setosa 1.4 0.2 5.1 3.5
2 versicolor 4.7 1.4 7.0 3.2
3 virginica 6.0 2.5 6.3 3.3
4 setosa 1.5 0.1 4.9 3.1
5 versicolor 3.9 1.4 5.2 2.7
6 virginica 6.1 2.5 7.2 3.6选取iris数据集Sepal.Length > 6的记录,构造一个指标SPWR (Sepal.Width与Petal.Width之比)。 按Species分组,分析SPWR的均数及标准差。
group_by(Species),构造分组tbl_dffilter(Sepal.Length > 6),筛选合格的记录mutate(SPWR=Sepal.Width / Petal.Width),构造次生变量summarise(mean(SPWR), sd(SPWR)),计算指标> iris %>% group_by(Species) %>% filter(Sepal.Length > 6) %>%
+ select(contains('Width')) %>% mutate(SPWR=Sepal.Width / Petal.Width) %>%
+ summarise(m=mean(SPWR), sd=sd(SPWR))
# A tibble: 2 × 3
Species m sd
<fctr> <dbl> <dbl>
1 versicolor 2.044987 0.2271170
2 virginica 1.494320 0.1874827
DT[i, j, by]: 对于数据集DT,选取子集i行,通过by对j列分组计算> library(data.table)
> data.table(iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1: 5.1 3.5 1.4 0.2 setosa
2: 4.9 3.0 1.4 0.2 setosa
3: 4.7 3.2 1.3 0.2 setosa
4: 4.6 3.1 1.5 0.2 setosa
5: 5.0 3.6 1.4 0.2 setosa
---
146: 6.7 3.0 5.2 2.3 virginica
147: 6.3 2.5 5.0 1.9 virginica
148: 6.5 3.0 5.2 2.0 virginica
149: 6.2 3.4 5.4 2.3 virginica
150: 5.9 3.0 5.1 1.8 virginica
fread快速读入数据文件flights <- fread("flights14.csv")
flights
# year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014 1 1 14 13 AA JFK LAX 359 2475 9
# 2: 2014 1 1 -3 13 AA JFK LAX 363 2475 11
# 3: 2014 1 1 2 9 AA JFK LAX 351 2475 19
# 4: 2014 1 1 -8 -26 AA LGA PBI 157 1035 7
# 5: 2014 1 1 2 1 AA JFK LAX 350 2475 13
# ---
# 253312: 2014 10 31 1 -30 UA LGA IAH 201 1416 14
# 253313: 2014 10 31 -5 -14 UA EWR IAH 189 1400 8
# 253314: 2014 10 31 -8 16 MQ LGA RDU 83 431 11
# 253315: 2014 10 31 -4 15 MQ LGA DTW 75 502 11
# 253316: 2014 10 31 -5 1 MQ LGA SDF 110 659 8
dim(flights)
# [1] 253316 11
| R: | i | j | by |
|---|---|---|---|
| SQL: | where | select | update | group by |
data.table的基本形式为
DT[i, j, by]
相当于SQL
select `j` from `DT` where `i` group by `by`
可以理解为
读取
DT, 以i筛选行, 然后计算j, 按by分组.
i> data.table(iris)[1:2]
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1: 5.1 3.5 1.4 0.2 setosa
2: 4.9 3.0 1.4 0.2 setosa
on: 复合筛选Sepal.Width == 3.1, Species == 'setosa'的记录> data.table(iris)[.(3.1, 'setosa'), on=.(Sepal.Width, Species)] Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1: 4.6 3.1 1.5 0.2 setosa 2: 4.9 3.1 1.5 0.1 setosa 3: 4.8 3.1 1.6 0.2 setosa 4: 4.9 3.1 1.5 0.2 setosa
> data.table(iris)[[1]] # 或 data.table(iris)[, Sepal.Length] [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 ...
j取list() 或 .()> data.table(iris)[,1:2]
> data.table(iris)[, list(Sepal.Length, Sepal.Width)],
> data.table(iris)[, .(Sepal.Length, Sepal.Width)]
Sepal.Length Sepal.Width
1: 5.1 3.5
---
150: 5.9 3.0
,with=FALSE> data.table(iris)[, seq(1, 5, 2), with=FALSE]
Sepal.Length Petal.Length Species
1: 5.1 1.4 setosa
---
150: 5.9 5.1 virginica
subset> subset(data.table(iris), select = startsWith(
+ names(iris), "Petal"))
> data.table(iris)[, .(Petal.Length, Petal.Width)]
Petal.Length Petal.Width
1: 1.4 0.2
2: 1.4 0.2
3: 1.3 0.2
4: 1.5 0.2
5: 1.4 0.2
---
146: 5.2 2.3
147: 5.0 1.9
148: 5.2 2.0
149: 5.4 2.3
150: 5.1 1.8
> subset(data.table(iris), select = !grepl(
+ "Petal", names(iris)))
> data.table(iris)[, -(3:4)]
> data.table(iris)[, c("Petal.Width",
+ "Petal.Length") :=NULL]
Sepal.Length Sepal.Width Species
1: 5.1 3.5 setosa
2: 4.9 3.0 setosa
3: 4.7 3.2 setosa
4: 4.6 3.1 setosa
5: 5.0 3.6 setosa
---
146: 6.7 3.0 virginica
147: 6.3 2.5 virginica
148: 6.5 3.0 virginica
149: 6.2 3.4 virginica
150: 5.9 3.0 virginica重命名某些列
setnames> d <- data.table(iris)
> setnames(d, 1:4, c("SL", "SW", "PL", "PW"))
> d
SL SW PL PW Species
1: 4.3 3.0 1.1 0.1 setosa
2: 4.4 3.2 1.3 0.2 setosa
3: 4.4 3.0 1.3 0.2 setosa
4: 4.4 2.9 1.4 0.2 setosa
5: 4.5 2.3 1.3 0.3 setosa
---
146: 7.7 3.8 6.7 2.2 virginica
147: 7.7 3.0 6.1 2.3 virginica
148: 7.7 2.8 6.7 2.0 virginica
149: 7.7 2.6 6.9 2.3 virginica
150: 7.9 3.8 6.4 2.0 virginica
重组列的次序
setcolorder> setcolorder(d, c(5, 1:4))
> d
Species PW SL SW PL
1: setosa 0.1 4.3 3.0 1.1
2: setosa 0.2 4.4 3.2 1.3
3: setosa 0.2 4.4 3.0 1.3
4: setosa 0.2 4.4 2.9 1.4
5: setosa 0.3 4.5 2.3 1.3
---
146: virginica 2.2 7.7 3.8 6.7
147: virginica 2.3 7.7 3.0 6.1
148: virginica 2.0 7.7 2.8 6.7
149: virginica 2.3 7.7 2.6 6.9
150: virginica 2.0 7.9 3.8 6.4> data.table(iris)[Petal.Width > 2 & Species == 'virginica']
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1: 6.3 3.3 6.0 2.5 virginica
2: 7.1 3.0 5.9 2.1 virginica
3: 6.5 3.0 5.8 2.2 virginica
4: 7.6 3.0 6.6 2.1 virginica
5: 7.2 3.6 6.1 2.5 virginica
......
19: 6.9 3.1 5.1 2.3 virginica
20: 6.8 3.2 5.9 2.3 virginica
21: 6.7 3.3 5.7 2.5 virginica
22: 6.7 3.0 5.2 2.3 virginica
23: 6.2 3.4 5.4 2.3 virginica
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
setorder> data.table(iris)[order(Sepal.Length, -Sepal.Width)]
> setorder(iris, Sepal.Length, -Sepal.Width)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1: 4.3 3.0 1.1 0.1 setosa
2: 4.4 3.2 1.3 0.2 setosa
3: 4.4 3.0 1.3 0.2 setosa
4: 4.4 2.9 1.4 0.2 setosa
5: 4.5 2.3 1.3 0.3 setosa
---
146: 7.7 3.8 6.7 2.2 virginica
147: 7.7 3.0 6.1 2.3 virginica
148: 7.7 2.8 6.7 2.0 virginica
149: 7.7 2.6 6.9 2.3 virginica
150: 7.9 3.8 6.4 2.0 virginica
:= (仅限单个)> data.table(iris)[, Sepal.LWR := Sepal.Length/Sepal.Width]
Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.LWR
1: 4.3 3.0 1.1 0.1 setosa 1.433333
2: 4.4 3.2 1.3 0.2 setosa 1.375000
---
149: 7.7 2.6 6.9 2.3 virginica 2.961538
150: 7.9 3.8 6.4 2.0 virginica 2.078947
> data.table(iris)[, Sepal.LWR := Sepal.Length/Sepal.Width][
+ , Petal.LWR := Petal.Length/Petal.Width)]
Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.LWR Petal.LWR
1: 5.1 3.5 1.4 0.2 setosa 1.457143 7.000000
2: 4.9 3.0 1.4 0.2 setosa 1.633333 7.000000
---
149: 6.2 3.4 5.4 2.3 virginica 1.823529 2.347826
150: 5.9 3.0 5.1 1.8 virginica 1.966667 2.833333
melt和dcastlibrary(data.table)后调用,如为data.table类则直接调用改写后的函数> iris.new1 <- melt(data.table(iris), id.var='Species') > head(iris.new1, 4) Species variable value 1: setosa Sepal.Length 4.3 2: setosa Sepal.Length 4.4 3: setosa Sepal.Length 4.4 4: setosa Sepal.Length 4.4
> iris.cast1 <- dcast(iris.new1, Species~variable, mean)
> iris.cast1
Species Sepal.Length Sepal.Width Petal.Length Petal.Width
1: setosa 5.006 3.428 1.462 0.246
2: versicolor 5.936 2.770 4.260 1.326
3: virginica 6.588 2.974 5.552 2.026
j整合计算.N> data.table(iris)[, .N, by=Species]
Species N
1: setosa 50
2: versicolor 50
3: virginica 50
.SD: 代表按by分拆的data.table们(.SDcols指定分组变量)> data.table(iris)[, lapply(.SD, mean), by=Species, .SDcols=c("Sepal.Width", "Petal.Width")]
Species Sepal.Width Petal.Width
1: setosa 3.428 0.246
2: versicolor 2.770 1.326
3: virginica 2.974 2.026
选取iris数据集Sepal.Length > 6的记录,构造一个指标SPWR (Sepal.Width与Petal.Width之比)。 按Species分组,分析SPWR的均数及标准差。
> d <- data.table(iris)
> d[Sepal.Length > 6, SPWR := Sepal.Width / Petal.Width
+ ][, list(m = mean(SPWR, na.rm=TRUE),
+ sd = sd(SPWR, na.rm=TRUE)), by=Species]
Species m sd
1: setosa NaN NA
2: versicolor 2.044987 0.2271170
3: virginica 1.494320 0.1874827
Thank you!