一、R包stringr

我觉得这个包比R自带的字符串处理函数更简单好用

字符操作：操作字符向量中的单个字符

str_length('abcdefgh') #返回字符长度
[1] 8
str_sub('abcdefgh',2,4) #返回切割的字符向量
[1] "bcd"
str_dup('abcdefgh',2) #重复拼接字符
[1] "abcdefghabcdefgh"

添加，移除和操作空白符

str_pad(string, width, side = c("left", "right", "both"), pad = " ") #添加空白符
str_pad("string", 10, side = "left", pad = " ")
[1] "    string"

str_trim(string, side = c("both", "left", "right")) #删除空白符
str_trim('   abc',side='both')
[1] "abc"

#段落操作：width为行宽，indent为每段首行的缩进，exdent为每段其余各行的缩进
str_wrap(string, width = 80, indent = 0, exdent = 0)

大小写转换处理

# 转大写
str_to_upper(string, locale = "en")
# 转小写
str_to_lower(string, locale = "en")
# 首字母大写
str_to_title(string, locale = "en")
# 句子首字母大写
str_to_sentence(string, locale = "en")

模式匹配函数

参数形式都大致一样

(string, pattern) #string是文本，pattern是匹配模式，可以写正则表达式
fruit <- c("apple", "banana", "pear", "pinapple")

> str_detect(fruit, "a") #返回布尔值
[1] TRUE TRUE TRUE TRUE

> str_subset(fruit, "a") #返回字符串
[1] "apple"    "banana"   "pear"     "pinapple"
> str_subset(fruit, "^a")
[1] "apple"

> str_count(fruit, "a") #返回出现次数
[1] 1 3 1 1

> str_locate(fruit, "a") #返回第一个匹配的起止位置
     start end
[1,]     1   1
[2,]     2   2
[3,]     3   3
[4,]     4   4
> str_locate_all(fruit, "a") #返回所有匹配字符的位置
[[1]]
     start end
[1,]     1   1

[[2]]
     start end
[1,]     2   2
[2,]     4   4
[3,]     6   6

[[3]]
     start end
[1,]     3   3

[[4]]
     start end
[1,]     4   4

str_match #从字符串中提取匹配组
str_match_all #字符串中提取匹配组

######
strings <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569",
  "387 287 6718", "apple", "233.398.9187  ", "482 952 3315",
  "239 923 8115 and 842 566 4692", "Work: 579-499-7527", "$1000",
  "Home: 543.355.3679")
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"

> str_match(strings, phone)
      [,1]           [,2]  [,3]  [,4]  
 [1,] "219 733 8965" "219" "733" "8965"
 [2,] "329-293-8753" "329" "293" "8753"
 [3,] NA             NA    NA    NA    
 [4,] "595 794 7569" "595" "794" "7569"
 [5,] "387 287 6718" "387" "287" "6718"
 [6,] NA             NA    NA    NA    
 [7,] "233.398.9187" "233" "398" "9187"
 [8,] "482 952 3315" "482" "952" "3315"
 [9,] "239 923 8115" "239" "923" "8115"
[10,] "579-499-7527" "579" "499" "7527"
[11,] NA             NA    NA    NA    
[12,] "543.355.3679" "543" "355" "3679"


> str_match_all(strings, phone)
[[1]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "219 733 8965" "219" "733" "8965"

[[2]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "329-293-8753" "329" "293" "8753"

[[3]]
     [,1] [,2] [,3] [,4]

[[4]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "595 794 7569" "595" "794" "7569"

[[5]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "387 287 6718" "387" "287" "6718"

[[6]]
     [,1] [,2] [,3] [,4]

[[7]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "233.398.9187" "233" "398" "9187"

[[8]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "482 952 3315" "482" "952" "3315"

[[9]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "239 923 8115" "239" "923" "8115"
[2,] "842 566 4692" "842" "566" "4692"

[[10]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "579-499-7527" "579" "499" "7527"

[[11]]
     [,1] [,2] [,3] [,4]

[[12]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "543.355.3679" "543" "355" "3679"

str_replace(string, pattern, replacement) #替换第一个出现字符
str_replace_all(string, pattern, replacement) #替换所有的字符

######
fruits <- c("one apple", "two pears", "three bananas")
> str_replace(fruits, "[aeiou]", "-")
[1] "-ne apple"     "tw- pears"     "thr-e bananas"
> str_replace_all(fruits, "[aeiou]", "-")
[1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"

str_split(string, pattern, n = Inf, simplify = FALSE) #字符串分割
str_split_fixed(string, pattern, n) #字符串分割

######
fruits <- c(
  "apples and oranges and pears and bananas",
  "pineapples and mangos and guavas"
)
> str_split(fruits, " and ")
[[1]]
[1] "apples"  "oranges" "pears"   "bananas"

[[2]]
[1] "pineapples" "mangos"     "guavas"    

str_split(fruits, " and ", simplify = TRUE)
     [,1]         [,2]      [,3]     [,4]     
[1,] "apples"     "oranges" "pears"  "bananas"
[2,] "pineapples" "mangos"  "guavas" "" 

> str_split(fruits, " and ", n = 3)
[[1]]
[1] "apples"            "oranges"          
[3] "pears and bananas"

[[2]]
[1] "pineapples" "mangos"     "guavas"

> str_split_fixed(fruits, " and ", 3)
     [,1]         [,2]      [,3]               
[1,] "apples"     "oranges" "pears and bananas"
[2,] "pineapples" "mangos"  "guavas"

str_extract(string, pattern) #从字符串中提取匹配字符
str_extract_all(string, pattern, simplify = FALSE)

> shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
> str_extract(shopping_list, "\\d")
[1] "4" NA  NA  "2"
> str_extract_all(shopping_list, "\\b[a-z]+\\b", simplify = TRUE)
     [,1]     [,2] [,3]   
[1,] "apples" ""   ""     
[2,] "bag"    "of" "flour"
[3,] "bag"    "of" "sugar"
[4,] "milk"   ""   ""

二、正则表达式

特殊字符

字符	含义
[ ]	括号内的任意字符被匹配
\	转义、某些特殊字符
^	匹配输入字符串的开始位置
( )	括号内所有字符作为整体被匹配
|	二选一
.	匹配除换行符以外的任意字符
$	匹配字符串的结束

重复

字符	含义
{n}	匹配n次
{n,}	至少匹配n次
{n,m}	匹配n到m次
*	前面的字符匹配0或多次
+	前面的字符匹配1或多次
?	前面的字符匹配0或1次

贴一张正则表达式图

三、python

导入：import re
a = re.compile(pattern)  # pattern为正则表达式字符串，re.compile必须传入该参数
a.search('text')
菜鸟教程