2017-01-15 14:04:35
nchar()
> nchar(c("Hello world!", "Goodbye guys!")) [1] 12 13
toupper()
, tolower()
, chartr()
> tolower('AgCTaaGGGcctTagct') # 转小写 [1] "agctaagggccttagct" > toupper("AgCTaaGGGcctTagct") # 转大写 [1] "AGCTAAGGGCCTTAGCT" > chartr("Tt", "Uu", 'AgCTaaGGGcctTagct') # 批量替换 [1] "AgCUaaGGGccuUagcu"
paste
/paste0
> paste(c("A", "B"), 1:2, sep="_") # sep连接向量 [1] "A_1" "B_2" > paste(c("A", "B"), 1:2, collapse="_") # 先sep连接向量,再collapse连接为标量 [1] "A 1_B 2" > paste0(c("A", "B"), 1:2) # 等价于paste(..., sep="") [1] "A1" "B2"
strsplit
> strsplit("Hello\nworld!", split="\n") [[1]] [1] "Hello" "world!" > strsplit("Hello", split="") # 单字符拆分 [[1]] [1] "H" "e" "l" "l" "o"
substr
/substring
substr(x, start, stop)
substring(text, first, last = 1000000L)
> substr("01234567", 2, 4) [1] "123" > substring("01234567", c(2, 4), c(4, 6)) # 等价于substr(rep("01234567", 2), c(2, 4), c(4, 6)) [1] "123" "345" > substring("01234567", seq(1, 7, by=2), seq(2, 8, by=2)) # 等价于substring("01234567", c(1, 3, 5, 7), c(2, 4, 6, 8)) # 也等价于substr(rep("01234567", 4), seq(1, 7, by=2), seq(2, 8, by=2)) [1] "01" "23" "45" "67"
grep
家族grep是UNIX下的模式识别库,基于正则表达式
返回位置的函数: grep
, regexpr
> grep("a", c("abca", "tbbt")) # 返回含匹配的元素下标 [1] 1 > regexpr("a", c("abca", "tbbt")) # 返回各元素中第一个匹配的位置 [1] 1 -1 attr(,"match.length") [1] 1 -1 attr(,"useBytes") [1] TRUE
grepl
> grepl("a", c("abca", "tbbt")) [1] TRUE FALSE
grep
家族 (续)gregexpr
, regexec
> gregexpr("a", c("abca", "tbbt")) # 返回全部查找结果 [[1]] [1] 1 4 attr(,"match.length") [1] 1 1 attr(,"useBytes") [1] TRUE [[2]] [1] -1 attr(,"match.length") [1] -1 attr(,"useBytes") [1] TRUE
> regexec("a", c("abca", "tbbt")) # 返回第一个查找结果 [[1]] [1] 1 attr(,"match.length") [1] 1 attr(,"useBytes") [1] TRUE [[2]] [1] -1 attr(,"match.length") [1] -1 attr(,"useBytes") [1] TRUE
grep
家族sub
, gsub
sub
> sub("a", "x", c("abca", "tbbt")) [1] "xbca" "tbbt"
gsub
> gsub("a", "x", c("abca", "tbbt")) [1] "xbcx" "tbbt"
replace
或substitute
> replace(1:5, c(3,5), 'a') [1] "1" "2" "a" "4" "a" > substitute(y~x) y ~ x
strtrim
: 定制输出宽度> strtrim(rep("abcde", 3), c(1, 5, 10)) [1] "a" "abcde" "abcde"
trimws
: 去空格> trimws(" abcd ") [1] "abcd"
strwrap
: 缩进和宽度> strwrap(stringi::stri_rand_lipsum(1), width=40, exdent=4) [1] "Lorem ipsum dolor sit amet, in ornare" [2] " vehicula proin lorem duis platea" [3] " aliquam ridiculus tortor. Tellus" [4] " conubia nibh elementum, quam lectus" [5] " odio duis eleifend. Sed dictumst" [6] " morbi laoreet dignissim, sapien" ...
grep
家族本质上是基于正则表达式工作的?regex
可了解更多,基本组成
> grep("2", c("way", "2", "go")) [1] 2
> grep("\\.", c("4.2", "1", "0.4")) [1] 1 3
> grep("[[:punct:]]", c("32", "a", "5-6"), perl=TRUE) [1] 3
> grep("\\w", c("w1", " ", 23)) [1] 1 3 > grep("[[:alnum:]]", c("w1", " ", 23), perl=TRUE) [1] 1 3 > grep("[[:lower:]]", c("w1", " ", 23), perl=TRUE) [1] 1 > grep("\\d", c("w1", " ", 23)) [1] 1 3 > grep("[[:digit:]]", c("w1", " ", 23)) [1] 1 3 > grep("\\s", c("w1", " ", 23)) [1] 2 > grep("[[:blank:]]", c("w1", " ", 23)) [1] 2
.
,含([]
)和不含([^]
)> string <- c("Hello ", " world", " hello", " piano", " cello.") > grep("[Hh]ello", string) [1] 1 3
> grep("[^Hh]ello", string) [1] 5
> string <- c("Grrr", "small", "Grrrrrr", "big") > grep("s.a", string) [1] 2
^
)、行尾($
)> string <- c("Hello ", " world", " hello", " piano", " cello.") > grep("^[Hh]ello", string) [1] 1
> string <- c("Hello ", " world", " hello", " piano", " cello.") > grep("[Hh]ello$", string) [1] 3
> string <- c("Grrr", "small", "Grrrrrr", "big") > grep("r?", string) [1] 1 2 3 4
> grep("r*", string) [1] 1 2 3 4
> grep("r+", string) [1] 1 3
> grep("r{4,6}", string) [1] 3
> grep("r{5,}", string) [1] 3
sub
或gsub
> sub("[Hh]ello", "Hi", "Hello world, hello everyone.") [1] "Hi world, hello everyone." > gsub("[Hh]ello", "hi", "Hello world, hello everyone.") [1] "hi world, hi everyone."
> gsub("[Hh]ello (\\w+)[[:punct:][:blank:]]", "\\1", + "Hello world, hello everyone.", perl=TRUE) [1] "world everyone"
|
> grep("(H|h|c)ello", c("Hello ", " world", " hello", " piano", " cello.")) [1] 1 3 5
> gsub("[Hh]ello (.+)", "\\1", "Hello world, hello everyone.") [1] "world, hello everyone."
> gsub("[Hh]ello (.+?)", "\\1", "Hello world, hello everyone.") [1] "world, everyone."
format
format
函数可将非文本格式化为文本> format(c('aa', 'b', 'ccc'), justify='right') [1] " aa" " b" "ccc" > format(c('aa', 'b', 'ccc'), justify='right', width=8) [1] " aa" " b" " ccc" > format(pi, digits=5) [1] "3.1416" > format(pi, scientific = TRUE) [1] "3.141593e+00" > format(1234567890, big.mark=',') [1] "1,234,567,890"
sprintf
sprintf
是C语言的格式化库,直观便捷fmt
模板包括%d
(整型), %f
( 固定格式), %e
(指数格式), %g
(双精度), %s
(文本)等, 模板标志前可加%m.n
(整数.小数), 空格, 0, #等辅助符号%1$
, %2$
定义模板索引号,通过*1$
, *2$
引用> sprintf("%d月销售额为¥%0.0f万元,占全年的%.1f%%。", 12, 40, 100*400/2000) [1] "12月销售额为¥40万元,占全年的20.0%。" > sprintf("%5.2e", 1234567890) [1] "1.23e+09" > sprintf("圆周率保留%1$d位小数,结果为%2$.*1$f", 1:4, pi) [1] "圆周率保留1位小数,结果为3.1" "圆周率保留2位小数,结果为3.14" [3] "圆周率保留3位小数,结果为3.142" "圆周率保留4位小数,结果为3.1416"
sessionInfo()
或Sys.getlocale("LC_CTYPE")
: "English_United States.1252"Sys.setlocale("LC_CTYPE", "chs")
将R的环境转为简体中文encoding
或fileEncoding
指定正确编码stringr::str_conv()
函数转化stringi
开发的字符处理工具包str_c
/str_join
, str_sub
, …str_trim
, str_wrap
, …str_count
, str_length
, …str_split
, str_subset
, str_match
, …str_conv
, str_to_upper
, …boundary
, coll
, regex
, …str_c(..., sep = "", collapse = NULL)
: 默认不带空格拼接,NA保持NA> library(stringr) > str_c(letters[1:2], letters[3:4], sep="-") [1] "a-c" "b-d" > str_c(letters[1:2], letters[3:4], collapse="") [1] "abcd"
str_sub(string, start = 1L, end = -1L)
: 同substring
> str_sub('abcdefghijklm', c(1, 4), c(3, 6)) [1] "abc" "def" > str_sub('abcdefghijklm', c(-3, -6), c(-1, -4)) [1] "klm" "hij"
str_trim(string, side = c("both", "left", "right"))
: 去两端空格和\t> str_trim(' blabla\tblabla ') [1] "blabla\tblabla"
str_dup(string, times)
: 重复字符> str_dup(letters[1:3], 2) [1] "aa" "bb" "cc"
str_wrap(string, width = 80, indent = 0, exdent = 0)
: 格式输出> cat(str_wrap('abcdefghijklm abcdefghijklm', width=5, exdent=5)) abcdefghijklm abcdefghijklm
str_count(string, pattern = "")
: 字数匹配统计> str_count(c("a.", ".", ".a.",NA), ".") [1] 2 1 3 NA > str_count(c("a.", ".", ".a.",NA), "\\.") [1] 1 1 2 NA
str_length(string)
: 字数统计> str_length(c("a.", ".", ".a.",NA)) # 等价于str_count(..., pattern="") [1] 2 1 3 NA
str_sort(x, decreasing = FALSE, na_last = TRUE, locale = "", ...)
: 排序返值str_order(x, decreasing = FALSE, na_last = TRUE, locale = "", ...)
: 排序返序号> str_sort(c('a', 1, '我', 2, '11'), locale="en") [1] "1" "11" "2" "a" "我" > str_sort(c('a', 1, '我', 2, '11'), locale="zh") [1] "1" "11" "2" "我" "a" > str_order(c('a', 1, '我', 2, '11'), TRUE, locale="zh") [1] 1 3 4 5 2
str_split(string, pattern, n = Inf)
: 匹配切割返回列表str_split_fixed(string, pattern, n)
: 匹配切割返回矩阵> str_split("aaa,bbb;ccc.ddd", pattern="[[:punct:]]", n=3) [[1]] [1] "aaa" "bbb" "ccc.ddd"
str_subset(string, pattern)
: 返回匹配字符串> str_subset(c("ab", "dc", "ac"), "a") [1] "ab" "ac"
str_match(string, pattern)
: 匹配提取str_match_all(string, pattern)
: 匹配提取全部,返回矩阵列表> str_match(c("ab", "dc", "ac"), "a") [,1] [1,] "a" [2,] NA [3,] "a"
str_detect(string, pattern)
: 匹配与否> str_detect(c("ab", "dc", "ac"), "a") [1] TRUE FALSE TRUE
word(string, start = 1L, end = start, sep = fixed(" "))
: 提取单词> word("aaa,bbb;ccc.ddd", 1:2, sep="[[:punct:]]") [1] "aaa" "bbb"
str_locate(string, pattern)
: 定位,返回首个str_locate_all(string, pattern)
: 定位,返回全部(矩阵列表)> str_locate(c("ab", "dc", "ac"), "a") start end [1,] 1 1 [2,] NA NA [3,] 1 1
str_replace(string, pattern, replacement)
: 模式替换> str_replace(c("ab", "dc", "ac"), "a", "A") [1] "Ab" "dc" "Ac"
str_extract(string, pattern)
: 提取模式,返回首个str_extract_all(string, pattern, simplify = FALSE)
: 提取模式,返回全部> str_extract(c("ab13", "d2c", "a_c"), "\\d") [1] "1" "2" NA
str_conv(string, encoding)
: 转编码> str_conv("\u4e0a\u6d77", "UTF-8") # Unicode ==> UTF-8 [1] "上海"
str_to_upper(string, locale = "")
: 转大写str_to_lower(string, locale = "")
: 转小写str_to_title(string, locale = "")
: 首字母大写> str_to_title(c("adb", "edb", "db de")) [1] "Adb" "Edb" "Db De"
Thank you!