2017-01-15 14:04:35
nchar()> nchar(c("Hello world!", "Goodbye guys!"))
[1] 12 13
toupper(), tolower(), chartr()> tolower('AgCTaaGGGcctTagct') # 转小写
[1] "agctaagggccttagct"
> toupper("AgCTaaGGGcctTagct") # 转大写
[1] "AGCTAAGGGCCTTAGCT"
> chartr("Tt", "Uu", 'AgCTaaGGGcctTagct') # 批量替换
[1] "AgCUaaGGGccuUagcu"
paste/paste0> paste(c("A", "B"), 1:2, sep="_") # sep连接向量
[1] "A_1" "B_2"
> paste(c("A", "B"), 1:2, collapse="_")
# 先sep连接向量,再collapse连接为标量
[1] "A 1_B 2"
> paste0(c("A", "B"), 1:2) # 等价于paste(..., sep="")
[1] "A1" "B2"
strsplit> strsplit("Hello\nworld!", split="\n")
[[1]]
[1] "Hello" "world!"
> strsplit("Hello", split="") # 单字符拆分
[[1]]
[1] "H" "e" "l" "l" "o"
substr/substringsubstr(x, start, stop)substring(text, first, last = 1000000L)> substr("01234567", 2, 4)
[1] "123"
> substring("01234567", c(2, 4), c(4, 6))
# 等价于substr(rep("01234567", 2), c(2, 4), c(4, 6))
[1] "123" "345"
> substring("01234567", seq(1, 7, by=2), seq(2, 8, by=2))
# 等价于substring("01234567", c(1, 3, 5, 7), c(2, 4, 6, 8))
# 也等价于substr(rep("01234567", 4), seq(1, 7, by=2), seq(2, 8, by=2))
[1] "01" "23" "45" "67"
grep家族grep是UNIX下的模式识别库,基于正则表达式
返回位置的函数: grep, regexpr
> grep("a", c("abca", "tbbt")) # 返回含有匹配的元素的下标
[1] 1
> regexpr("a", c("abca", "tbbt"))
[1] 1 -1
attr(,"match.length")
[1] 1 -1
attr(,"useBytes")
[1] TRUE
grepl> grepl("a", c("abca", "tbbt"))
[1] TRUE FALSE
grep家族 (续)gregexpr, regexec> gregexpr("a", c("abca", "tbbt")) # 返回全部查找结果
[[1]]
[1] 1 4
attr(,"match.length")
[1] 1 1
attr(,"useBytes")
[1] TRUE
[[2]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE
> regexec("a", c("abca", "tbbt")) # 返回第一个查找结果
[[1]]
[1] 1
attr(,"match.length")
[1] 1
attr(,"useBytes")
[1] TRUE
[[2]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE
grep家族: sub, gsub
sub
> sub("a", "x", c("abca", "tbbt"))
[1] "xbca" "tbbt"
gsub> gsub("a", "x", c("abca", "tbbt"))
[1] "xbcx" "tbbt"
replace或substitute
> replace(1:5, c(3,5), 'a')
[1] "1" "2" "a" "4" "a"
> substitute(y~x)
y ~ x
strtrim: 定制输出宽度> strtrim(rep("abcde", 3), c(1, 5, 10))
[1] "a" "abcde" "abcde"
trimws: 去空格> trimws(" abcd ")
[1] "abcd"
strwrap: 缩进和宽度
> strwrap(stringi::stri_rand_lipsum(1), width=40, exdent=4)
[1] "Lorem ipsum dolor sit amet, in ornare"
[2] "    vehicula proin lorem duis platea"
[3] "    aliquam ridiculus tortor. Tellus"
[4] "    conubia nibh elementum, quam lectus"
[5] "    odio duis eleifend. Sed dictumst"
[6] "    morbi laoreet dignissim, sapien"
...
grep家族本质上是基于正则表达式工作的?regex可了解更多,基本组成
> grep("2", c("way", "2", "go"))
[1] 2
> grep("\\.", c("4.2", "1", "0.4"))
[1] 1 3
> grep("[[:punct:]]", c("32", "a", "5-6"), perl=TRUE)
[1] 3
> grep("\\w", c("w1", " ", 23))
[1] 1 3
> grep("[[:alnum:]]", c("w1", " ", 23), perl=TRUE)
[1] 1 3
> grep("[[:lower:]]", c("w1", " ", 23), perl=TRUE)
[1] 1
> grep("\\d", c("w1", " ", 23))
[1] 1 3
> grep("[[:digit:]]", c("w1", " ", 23))
[1] 1 3
> grep("\\s", c("w1", " ", 23))
[1] 2
> grep("[[:blank:]]", c("w1", " ", 23))
[1] 2
.,含([])和不含([^])> string <- c("Hello ", " world", " hello", " piano", " cello.")
> grep("[Hh]ello", string)
[1] 1 3
> grep("[^Hh]ello", string)
[1] 5
> string <- c("Grrr", "small", "Grrrrrr", "big")
> grep("s.a", string)
[1] 2
行首(^)、行尾($)
> string <- c("Hello ", " world", " hello", " piano", " cello.")
> grep("^[Hh]ello", string)
[1] 1
> string <- c("Hello ", " world", " hello", " piano", " cello.")
> grep("[Hh]ello$", string)
[1] 3
> string <- c("Grrr", "small", "Grrrrrr", "big")
> grep("r?", string)
[1] 1 2 3 4
> grep("r*", string)
[1] 1 2 3 4
> grep("r+", string)
[1] 1 3
> grep("r{4,6}", string)
[1] 3
> grep("r{5,}", string)
[1] 3
sub或gsub> sub("[Hh]ello", "Hi", "Hello world, hello everyone.")
[1] "Hi world, hello everyone."
> gsub("[Hh]ello", "hi", "Hello world, hello everyone.")
[1] "hi world, hi everyone."
> gsub("[Hh]ello (\\w+)[[:punct:][:blank:]]", "\\1",
+ "Hello world, hello everyone.", perl=TRUE)
[1] "world everyone"
|> grep("(H|h|c)ello", c("Hello ", " world", " hello", " piano", " cello."))
[1] 1 3 5
> gsub("[Hh]ello (.+)", "\\1", "Hello world, hello everyone.")
[1] "world, hello everyone."
> gsub("[Hh]ello (.+?)", "\\1", "Hello world, hello everyone.")
[1] "world, everyone."
formatformat函数可将非文本格式化为文本> format(c('aa', 'b', 'ccc'), justify='right')
[1] " aa" "  b" "ccc"
> format(c('aa', 'b', 'ccc'), justify='right', width=8)
[1] "      aa" "       b" "     ccc"
> format(pi, digits=5)
[1] "3.1416"
> format(pi, scientific = TRUE)
[1] "3.141593e+00"
> format(1234567890, big.mark=',')
[1] "1,234,567,890"
sprintf
sprintf是C语言的格式化库,直观便捷
fmt模板包括%d(整型), %f(固定格式), %e(指数格式), %g(按数值大小自动选用%e或%f), %s(文本)等, 模板标志前可加%m.n(宽度.小数位数), 空格, 0, #等辅助符号
%1$, %2$定义模板索引号,通过*1$, *2$引用
> sprintf("%d月销售额为¥%0.0f万元,占全年的%.1f%%。", 12, 40, 100*400/2000)
[1] "12月销售额为¥40万元,占全年的20.0%。"
> sprintf("%5.2e", 1234567890)
[1] "1.23e+09"
> sprintf("圆周率保留%1$d位小数,结果为%2$.*1$f", 1:4, pi)
[1] "圆周率保留1位小数,结果为3.1" "圆周率保留2位小数,结果为3.14"
[3] "圆周率保留3位小数,结果为3.142" "圆周率保留4位小数,结果为3.1416"
sessionInfo()或Sys.getlocale("LC_CTYPE"): "English_United States.1252"
Sys.setlocale("LC_CTYPE", "chs")将R的环境转为简体中文
encoding或fileEncoding指定正确编码
stringr::str_conv()函数转化
stringi开发的字符处理工具包
str_c/str_join, str_sub, …
str_trim, str_wrap, …
str_count, str_length, …
str_split, str_subset, str_match, …
str_conv, str_to_upper, …
boundary, coll, regex, …
str_c(..., sep = "", collapse = NULL): 默认不带空格拼接,NA保持NA
> library(stringr)
> str_c(letters[1:2], letters[3:4], sep="-")
[1] "a-c" "b-d"
> str_c(letters[1:2], letters[3:4], collapse="")
[1] "acbd"
str_sub(string, start = 1L, end = -1L): 同substring> str_sub('abcdefghijklm', c(1, 4), c(3, 6))
[1] "abc" "def"
> str_sub('abcdefghijklm', c(-3, -6), c(-1, -4))
[1] "klm" "hij"
str_trim(string, side = c("both", "left", "right")): 去两端空格和\t> str_trim(' blabla\tblabla ')
[1] "blabla\tblabla"
str_dup(string, times): 重复字符> str_dup(letters[1:3], 2) [1] "aa" "bb" "cc"
str_wrap(string, width = 80, indent = 0, exdent = 0): 格式输出> cat(str_wrap('abcdefghijklm abcdefghijklm', width=5, exdent=5))
abcdefghijklm
     abcdefghijklm
str_count(string, pattern = ""): 字数匹配统计> str_count(c("a.", ".", ".a.",NA), ".")
[1] 2 1 3 NA
> str_count(c("a.", ".", ".a.",NA), "\\.")
[1] 1 1 2 NA
str_length(string): 字数统计> str_length(c("a.", ".", ".a.",NA)) # 等价于str_count(..., pattern="")
[1] 2 1 3 NA
str_sort(x, decreasing = FALSE, na_last = TRUE, locale = "", ...): 排序返值str_order(x, decreasing = FALSE, na_last = TRUE, locale = "", ...): 排序返序号> str_sort(c('a', 1, '我', 2, '11'), locale="en")
[1] "1" "11" "2" "a" "我"
> str_sort(c('a', 1, '我', 2, '11'), locale="zh")
[1] "1" "11" "2" "我" "a"
> str_order(c('a', 1, '我', 2, '11'), TRUE, locale="zh")
[1] 1 3 4 5 2
str_split(string, pattern, n = Inf): 匹配切割返回列表str_split_fixed(string, pattern, n): 匹配切割返回矩阵> str_split("aaa,bbb;ccc.ddd", pattern="[[:punct:]]", n=3)
[[1]]
[1] "aaa" "bbb" "ccc.ddd"
str_subset(string, pattern): 返回匹配字符串> str_subset(c("ab", "dc", "ac"), "a")
[1] "ab" "ac"
str_match(string, pattern): 匹配提取首个,返回矩阵
str_match_all(string, pattern): 匹配提取全部,返回矩阵列表
> str_match(c("ab", "dc", "ac"), "a")
[,1]
[1,] "a"
[2,] NA
[3,] "a"
str_detect(string, pattern): 匹配与否> str_detect(c("ab", "dc", "ac"), "a")
[1] TRUE FALSE TRUE
word(string, start = 1L, end = start, sep = fixed(" ")): 提取单词> word("aaa,bbb;ccc.ddd", 1:2, sep="[[:punct:]]")
[1] "aaa" "bbb"
str_locate(string, pattern): 定位,返回首个(矩阵)
str_locate_all(string, pattern): 定位,返回全部(矩阵列表)
> str_locate(c("ab", "dc", "ac"), "a")
start end
[1,] 1 1
[2,] NA NA
[3,] 1 1
str_replace(string, pattern, replacement): 模式替换> str_replace(c("ab", "dc", "ac"), "a", "A")
[1] "Ab" "dc" "Ac"
str_extract(string, pattern): 提取模式,返回首个str_extract_all(string, pattern, simplify = FALSE): 提取模式,返回全部> str_extract(c("ab13", "d2c", "a_c"), "\\d")
[1] "1" "2" NA
str_conv(string, encoding): 转编码> str_conv("\u4e0a\u6d77", "UTF-8") # Unicode ==> UTF-8
[1] "上海"
str_to_upper(string, locale = ""): 转大写str_to_lower(string, locale = ""): 转小写str_to_title(string, locale = ""): 首字母大写> str_to_title(c("adb", "edb", "db de"))
[1] "Adb" "Edb" "Db De"
Thank you!