一、R包stringr

我觉得这个包比R自带的字符串处理函数更简单好用

  • 字符操作:操作字符向量中的单个字符
1
2
3
4
5
6
str_length('abcdefgh') #返回字符长度
[1] 8
str_sub('abcdefgh',2,4) #返回切割的字符向量
[1] "bcd"
str_dup('abcdefgh',2) #重复拼接字符
[1] "abcdefghabcdefgh"
  • 添加,移除和操作空白符
1
2
3
4
5
6
7
8
9
10
str_pad(string, width, side = c("left", "right", "both"), pad = " ") #添加空白符
str_pad("string", 10, side = "left", pad = " ")
[1] "    string"

str_trim(string, side = c("both", "left", "right")) #删除空白符
str_trim(' abc',side='both')
[1] "abc"

#段落换行排版: width为行宽, indent为每段第一行的缩进, exdent为每段其余各行的缩进
str_wrap(string, width = 80, indent = 0, exdent = 0)
  • 大小写转换处理
1
2
3
4
5
6
7
8
# 转大写
str_to_upper(string, locale = "en")
# 转小写
str_to_lower(string, locale = "en")
# 首字母大写
str_to_title(string, locale = "en")
# 句子首字母大写
str_to_sentence(string, locale = "en")
  • 模式匹配函数

    参数形式都大致一样

1
2
(string,pattern) #string是文本,pattern是匹配模式,可以写正则表达式
fruit <- c("apple", "banana", "pear", "pinapple")
1
2
> str_detect(fruit, "a") #返回布尔值
[1] TRUE TRUE TRUE TRUE
1
2
3
4
> str_subset(fruit, "a") #返回字符串
[1] "apple" "banana" "pear" "pinapple"
> str_subset(fruit, "^a")
[1] "apple"
1
2
> str_count(fruit, "a") #返回出现次数
[1] 1 3 1 1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
> str_locate(fruit, "a") #返回第一个匹配的起止位置
start end
[1,] 1 1
[2,] 2 2
[3,] 3 3
[4,] 4 4
> str_locate_all(fruit, "a") #返回所有匹配字符的位置
[[1]]
start end
[1,] 1 1

[[2]]
start end
[1,] 2 2
[2,] 4 4
[3,] 6 6

[[3]]
start end
[1,] 3 3

[[4]]
start end
[1,] 4 4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
str_match #从字符串中提取第一个匹配及其各捕获组
str_match_all #从字符串中提取所有匹配及其各捕获组

######
strings <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569",
"387 287 6718", "apple", "233.398.9187 ", "482 952 3315",
"239 923 8115 and 842 566 4692", "Work: 579-499-7527", "$1000",
"Home: 543.355.3679")
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"

> str_match(strings, phone)
[,1] [,2] [,3] [,4]
[1,] "219 733 8965" "219" "733" "8965"
[2,] "329-293-8753" "329" "293" "8753"
[3,] NA NA NA NA
[4,] "595 794 7569" "595" "794" "7569"
[5,] "387 287 6718" "387" "287" "6718"
[6,] NA NA NA NA
[7,] "233.398.9187" "233" "398" "9187"
[8,] "482 952 3315" "482" "952" "3315"
[9,] "239 923 8115" "239" "923" "8115"
[10,] "579-499-7527" "579" "499" "7527"
[11,] NA NA NA NA
[12,] "543.355.3679" "543" "355" "3679"


> str_match_all(strings, phone)
[[1]]
[,1] [,2] [,3] [,4]
[1,] "219 733 8965" "219" "733" "8965"

[[2]]
[,1] [,2] [,3] [,4]
[1,] "329-293-8753" "329" "293" "8753"

[[3]]
[,1] [,2] [,3] [,4]

[[4]]
[,1] [,2] [,3] [,4]
[1,] "595 794 7569" "595" "794" "7569"

[[5]]
[,1] [,2] [,3] [,4]
[1,] "387 287 6718" "387" "287" "6718"

[[6]]
[,1] [,2] [,3] [,4]

[[7]]
[,1] [,2] [,3] [,4]
[1,] "233.398.9187" "233" "398" "9187"

[[8]]
[,1] [,2] [,3] [,4]
[1,] "482 952 3315" "482" "952" "3315"

[[9]]
[,1] [,2] [,3] [,4]
[1,] "239 923 8115" "239" "923" "8115"
[2,] "842 566 4692" "842" "566" "4692"

[[10]]
[,1] [,2] [,3] [,4]
[1,] "579-499-7527" "579" "499" "7527"

[[11]]
[,1] [,2] [,3] [,4]

[[12]]
[,1] [,2] [,3] [,4]
[1,] "543.355.3679" "543" "355" "3679"
1
2
3
4
5
6
7
8
9
str_replace(string, pattern, replacement) #替换第一个匹配到的内容
str_replace_all(string, pattern, replacement) #替换所有匹配到的内容

######
fruits <- c("one apple", "two pears", "three bananas")
> str_replace(fruits, "[aeiou]", "-")
[1] "-ne apple" "tw- pears" "thr-e bananas"
> str_replace_all(fruits, "[aeiou]", "-")
[1] "-n- -ppl-" "tw- p--rs" "thr-- b-n-n-s"
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
str_split(string, pattern, n = Inf, simplify = FALSE) #字符串分割,返回列表(simplify = TRUE时返回矩阵)
str_split_fixed(string, pattern, n) #字符串分割,返回固定n列的字符矩阵

######
fruits <- c(
"apples and oranges and pears and bananas",
"pineapples and mangos and guavas"
)
> str_split(fruits, " and ")
[[1]]
[1] "apples" "oranges" "pears" "bananas"

[[2]]
[1] "pineapples" "mangos" "guavas"

str_split(fruits, " and ", simplify = TRUE)
[,1] [,2] [,3] [,4]
[1,] "apples" "oranges" "pears" "bananas"
[2,] "pineapples" "mangos" "guavas" ""

> str_split(fruits, " and ", n = 3)
[[1]]
[1] "apples" "oranges"
[3] "pears and bananas"

[[2]]
[1] "pineapples" "mangos" "guavas"

> str_split_fixed(fruits, " and ", 3)
[,1] [,2] [,3]
[1,] "apples" "oranges" "pears and bananas"
[2,] "pineapples" "mangos" "guavas"
1
2
3
4
5
6
7
8
9
10
11
12
str_extract(string, pattern) #从字符串中提取第一个匹配的内容
str_extract_all(string, pattern, simplify = FALSE) #从字符串中提取所有匹配的内容

> shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
> str_extract(shopping_list, "\\d")
[1] "4" NA NA "2"
> str_extract_all(shopping_list, "\\b[a-z]+\\b", simplify = TRUE)
[,1] [,2] [,3]
[1,] "apples" "" ""
[2,] "bag" "of" "flour"
[3,] "bag" "of" "sugar"
[4,] "milk" "" ""

二、正则表达式

  • 特殊字符
字符 含义
[ ] 括号内的任意字符被匹配
\ 转义、某些特殊字符
^ 匹配输入字符串的开始位置
( ) 括号内所有字符作为整体被匹配
| 二选一
. 匹配除换行符以外的任意字符
$ 匹配字符串的结束
  • 重复
字符 含义
{n} 匹配n次
{n,} 至少匹配n次
{n,m} 匹配n到m次
* 前面的字符匹配0次或多次
+ 前面的字符匹配1次或多次
? 前面的字符匹配0次或1次
  • 贴一张正则表达式图

三、python

  • 导入import re
  • a = re.compile(pattern)  (pattern为正则表达式字符串)
  • a.search('text')
  • 菜鸟教程