Basics of web pages

  • A web page generally has three parts: HTML (HyperText Markup Language), CSS (Cascading Style Sheets), and JavaScript (the scripting language).
  • With a page open, Ctrl + U quickly shows its source.
  • Sometimes Ctrl + U is not enough because the content is injected by <script>; in that case you either decode the JS or use webdriver.
  • Another option is F12, which opens the browser's developer tools.
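
A quick way to tell which case you are in is to fetch the page with requests and check whether the text you see in the browser is present in the raw HTML. A minimal sketch, with a hypothetical URL and keyword:

import requests

url = 'https://example.com/page'                # hypothetical page
keyword = 'some text visible in the browser'    # hypothetical target string

res = requests.get(url, timeout=5)
if keyword in res.text:
    print('The data is in the static HTML: requests + BeautifulSoup is enough.')
else:
    print('The data is probably rendered by JS: decode the JS or use webdriver.')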

HTML

  • Mainly follow the 菜鸟教程 (runoob) tutorial.
  • Pay special attention to whether the page uses <iframe>; I got burned badly by this (see the sketch after the tag list below).
<html>..</html>    marks everything in between as the web page
<body>..</body>    the content visible to the user
<div>..</div>      a division / container block
<p>..</p>          a paragraph
<li>..</li>        a list item
<img src="">       an image (a void element, so it has no closing tag)
<h1>..</h1>        a heading
<a href="">..</a>  a hyperlink
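
About the <iframe> pitfall above: an iframe embeds a separate document, so the elements inside it are not in the outer page's HTML at all. A minimal sketch (hypothetical URL) for spotting iframes and finding where their content actually lives:

import requests
from bs4 import BeautifulSoup

res = requests.get('https://example.com/page')   # hypothetical page
soup = BeautifulSoup(res.text, 'html.parser')

# Each iframe's content must be fetched from its own src URL.
for frame in soup.find_all('iframe'):
    print('iframe found, real content lives at:', frame.get('src'))

# With Selenium, call driver.switch_to.frame(...) before locating elements inside the frame.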

Legality of crawling

Almost every website has a file named robots.txt.

robots.txt (Baidu Baike entry)

The Sitemap entry lists which pages of the site are available for crawling.
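
Python's standard library can check robots.txt programmatically via urllib.robotparser. A minimal sketch, using haodf.com as the example because its robots.txt is quoted later in these notes:

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('https://www.haodf.com/robots.txt')
rp.read()

# Allowed for a normal listing page, disallowed for paths such as /api/*.
print(rp.can_fetch('*', 'https://www.haodf.com/yiyuan/all/list.htm'))
print(rp.can_fetch('*', 'https://www.haodf.com/api/anything'))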

Python

  • Main reference: CSDN
import requests
# pass headers as a keyword argument (the second positional argument of requests.get is params)
res = requests.get(url, headers=headers)
BeautifulSoup
  • Beautiful Soup parses the fetched page; see the official documentation.
from bs4 import BeautifulSoup
soup = BeautifulSoup(res.text,'html.parser')
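
A short usage sketch once the page is parsed (the tag names are only for illustration): elements are located with find / find_all and their text is read from .text:

# Continuing from the soup object above.
title = soup.find('h1')                  # first <h1> on the page
links = soup.find_all('a', href=True)    # all hyperlinks with an href

print(title.text if title else 'no <h1> found')
for a in links[:5]:
    print(a.text.strip(), a['href'])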

Scraping with R

library(xml2)   # read_html
library(rvest)  # html_nodes / html_text
web = read_html('https://example.com')   # put the page URL here (placeholder)
web.html = html_nodes(web, 'xxxx')       # 'xxxx' stands for the CSS selector / node name
web.text = html_text(web.html)           # extract the text of the selected nodes

Scraping with webdriver

  • This is what I used in April 2020 to scrape US COVID-19 data, because I could not get around the noscript problem.
  • Remember to match the chromedriver version to your Chrome version.
  • Selenium basics
  • To get an XPath, right-click the element on the page and choose Inspect; the developer tools open on that node, and right-clicking the node there lets you copy its XPath, which is enough to locate the element (see the sketch below).
  • The actual scraping code is backed up further down.
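
A minimal Selenium sketch using an XPath copied that way (the URL and XPath are placeholders; the chromedriver version must match the installed Chrome):

from selenium import webdriver
from time import sleep

driver = webdriver.Chrome()              # requires a matching chromedriver
driver.get('https://example.com')        # placeholder URL
sleep(5)                                 # give the JS time to render

# XPath copied from the developer tools (placeholder here).
elem = driver.find_element_by_xpath('//*[@id="content"]/div[1]/h1')
# In Selenium 4 the equivalent call is driver.find_element(By.XPATH, ...).
print(elem.text)
driver.quit()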

Scraping haodf.com

User-agent: *
Disallow: /chaos/*
Disallow: /api/*
Disallow: /index/*
Disallow: /bingli/*
Disallow: /message/*
Disallow: /passport/*
Disallow: /senderr/*
Sitemap: http://www.haodf.com/sitemap.xml

Reference for the anti-crawler workaround below: https://blog.csdn.net/qq_27302597/article/details/79411808

import execjs  # provided by the PyExecJS package (pip install PyExecJS)

JS decoding

The site first replies with HTTP status 521 and an obfuscated <script> that computes the __jsl_clearance cookie; the code below extracts that JS, runs it with execjs, and repeats the request with the resulting cookie.

# requests, re and execjs must be imported, and headers / url defined (see above).
def get_521_content(url):
    req = requests.get(url, headers=headers, timeout=5)
    print(req.status_code, req.text)
    if req.status_code == 521:
        cookies = dict(req.cookies.items())
        print(cookies)
        js_con = ''.join(re.findall('<script>(.*?)</script>', req.text))
        if js_con:
            __jsl_clearance = fixed_fun(js_con, url)
            if __jsl_clearance:
                key, value = __jsl_clearance.split('=')
                cookies[key] = value
                return cookies


# Run the JS to get the key/value pair of the __jsl_clearance cookie
def fixed_fun(js_con, url):  # js_con: the JS returned by the first request
    func_return = js_con.replace('eval(', 'return(')
    print('After replacing eval with return (1st pass): ', func_return)
    content = execjs.compile(func_return)
    fn = js_con.split('=')[0].split(' ')[1]
    evaled_func = content.call(fn)
    print('After running the JS (1st pass): ', evaled_func)
    fn = evaled_func.split('=')[0].split(' ')[1]  # the dynamically generated function name
    aa = evaled_func.split("<a href=\\'/\\'>")    # the content of the <a> tag
    aa = aa[1].split("</a>")[0] if len(aa) >= 2 else ''
    mode_func = evaled_func. \
        replace(
            "setTimeout('location.href=location.pathname+location.search.replace(/[\\?|&]captcha-challenge/,\\'\\')',1500);document.cookie=",
            'return'). \
        replace(';if((function(){try{return !!window.addEventListener;}', ''). \
        replace(
            "}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")",
            ''). \
        replace(
            "if((function(){try{return !!window.addEventListener;}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")",
            ''). \
        replace("return'__jsl_clearance", "var window={};return '__jsl_clearance"). \
        replace(
            "var " + fn + "=document.createElement('div');" + fn + ".innerHTML='<a href=\\'/\\'>" + aa + "</a>';" + fn + "=" + fn + ".firstChild.href",
            "var " + fn + "='" + url + "'")
    print('JS after the second round of replacements:', mode_func)
    try:
        content = execjs.compile(mode_func)
        cookies = content.call(fn)
        __jsl_clearance = cookies.split(';')[0]
        print(__jsl_clearance)
        return __jsl_clearance
    except:
        print('JS execution error:', mode_func)
        return None


# Second request for the detail page, carrying the decoded cookie
def con_spider(cookies, url):
    response = requests.get(url, headers=headers, cookies=cookies, timeout=5)
    if response.status_code == 200:
        response.encoding = 'utf-8'
        print(response.status_code)
        # print(response.text)
        return response
    else:
        print('Bad status code on the second request:', response.status_code)
        return None


cookies = get_521_content(url)
con_spider(cookies, url)
  • Get the list of hospitals
"""
author:cfl
获取医院列表.
"""
import requests
from bs4 import BeautifulSoup
import re
import csv


headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/78.0.3904.108 Safari/537.36'}

# 获取全部省
def get_all_province(url):
# 获取页面内容
res = requests.get(url, headers=headers)
html = res.text
html = BeautifulSoup(html, 'lxml')

# 获取全部省
div_province = html.findAll('div', attrs={'class': 'ct'})[0]
div_province_list = div_province.findAll('div', attrs={'class': re.compile(r'kstl')})
province_name_list = []
province_href_list = []
for j in div_province_list:
name = re.findall(r'>(.+?)<', str(j))[0]
href = re.findall(r'yiyuan/(.+?)/', str(j))[0]
province_name_list.append(name)
province_href_list.append(href)

return province_name_list, province_href_list


# 获取医院
def get_hospital_id(pathname,
province_name, province_href):
url = 'https://www.haodf.com/yiyuan/%s/list.htm' % province_href

# 获取页面内容
res = requests.get(url, headers=headers)
html = res.text
html = BeautifulSoup(html, 'lxml')

# 当前省的全部城市
div_city = html.findAll('div', attrs={'class': 'ct'})[1]

# 全部城市名
div_city_name_list = div_city.findAll('div', attrs={'class': 'm_title_green'})
city_name_list = []
for j in div_city_name_list:
city_name = re.findall(r'>(.+?)<', str(j))[0]
city_name_list.append(city_name)

# 按区域获取医院
hospital_address_list = []
hospital_name_list = []
hospital_href_list = []
for j in range(len(city_name_list)):
# 区域
div_district = div_city.findAll('div', attrs={'class': 'm_ctt_green'})[j]
a_list = div_district.findAll('a', attrs={'href': re.compile(r'hospital')})
for k in a_list:
name = re.findall(r'>(.+?)<', str(k))[0]
href = re.findall(r'hospital/(.+?)\.htm', str(k))[0]
hospital_address_list.append(province_name + city_name_list[j])
hospital_name_list.append(name)
hospital_href_list.append(href)
# 写入医院数据
write_doc(pathname,hospital_address_list, hospital_href_list, hospital_name_list)

# 'hospital_address', 'hospital_href', 'hospital_name'
def write_doc(pathname,
hospital_address_list,
hospital_href_list,
hospital_name_list):
# 医院
with open(pathname + 'hospital.csv', 'a') as hospital:
# 写入行数据
for j in range(len(hospital_name_list)):
hospital.write('\n'+','.join([hospital_address_list[j],hospital_href_list[j],hospital_name_list[j]]))


if __name__ == '__main__':
my_pathname = '../data/'
with open(my_pathname + 'hospital.csv', 'w') as hospital:
hospital.write(','.join(['address','href','name']))
# 全部医院列表
my_url = 'https://www.haodf.com/yiyuan/all/list.htm'
# 省名称、链接
my_province_name_list, my_province_href_list = get_all_province(my_url)
for i in range(len(my_province_name_list)):
get_hospital_id(my_pathname, my_province_name_list[i], my_province_href_list[i])
  • Get the list of departments
"""
author: cfl
获取科室列表
"""
import requests
from bs4 import BeautifulSoup
import re
import csv
import time
import threading
import os
from random import randint


headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/78.0.3904.108 Safari/537.36'}


def read_doc(pathname):
# 全列表
hospital_all_href_list = []
hospital_all_name_list = []
hospital_all_reader_list = csv.reader(open(pathname + 'hospital.csv', 'r'))
# 删掉第一行
n = 0
for j in hospital_all_reader_list:
if n != 0:
hospital_all_href_list.append(j[1])
hospital_all_name_list.append(j[2])
n += 1

# 已完成列表
try:
office_finish_href_list = []
office_finish_reader_list = csv.reader(open(pathname + 'office_finish.csv', 'r'))
for j in office_finish_reader_list:
office_finish_href_list.append(j[0])
office_finish_href_set = set(office_finish_href_list)
except:
office_finish_href_list = []
office_finish_href_set = set(office_finish_href_list)

# 待爬取列表
hospital_href_list = []
hospital_name_list = []

for j in range(len(hospital_all_href_list)):
if hospital_all_href_list[j] not in office_finish_href_set:
hospital_href_list.append(hospital_all_href_list[j])
hospital_name_list.append(hospital_all_name_list[j])
return hospital_href_list,hospital_name_list


def get_office(pathname, href, Name):
url = 'https://www.haodf.com/hospital/%s/daifu.htm' % href
# 获取页面内容
res = requests.get(url, headers=headers)
html = res.text
html = BeautifulSoup(html, 'lxml')

# 科室类型
div_office_type = html.findAll('div', attrs={'class': 'intro_doc-nav-mod'})[0]

# 全部科室类型名
div_office_type_name_list = div_office_type.findAll('div', attrs={'class': 'de_title-mod'})
office_type_name_list = []
for j in div_office_type_name_list:
office_type_name = re.findall(r'>(.+?)<', str(j))[0]
office_type_name_list.append(office_type_name)

# 按科室类型获取科室
for j in range(len(office_type_name_list)):
# 科室
ul_office = div_office_type.findAll('ul', attrs={'class': 'de_content-mod'})[j]
a_list = ul_office.findAll('a', attrs={'href': re.compile(r'daifu')})

office_href_list = []
office_name_list = []
for k in a_list:
office_href = re.findall(r'hospital/.+?/(.+?)/daifu\.htm', str(k))[0]
name = re.findall(r'>(.+?)<', str(k))[0]
office_href_list.append(office_href)
office_name_list.append(Name + '_' + office_type_name_list[j] + '_' + name)
# 写入医院数据
write_doc(pathname,
href, office_href_list, office_name_list)


def write_table(pathname):
with open(pathname + 'office.csv', 'w') as office:
office.write(','.join(['address', 'office_href', 'office_name']))

# 'href', 'office'
def write_doc(pathname,
href, office_href_list, office_name_list):
# 医院细节信息
with open(pathname + 'office.csv', 'a') as office:
# 写入行数据
for j in range(len(office_href_list)):
office.write('\n'+','.join([href, office_href_list[j], office_name_list[j]]))


my_pathname = '../data/'
# 写入表头
write_table(my_pathname)
# 获取待爬取列表
href,name = read_doc(my_pathname)
error = []
# 记录一家医院,协和医院210
for i in range(len(href)):
time.sleep(randint(2, 5))
try:
get_office(my_pathname, href[i], name[i])
except:
error = error.append(i)
  • Scrape the doctor information
"""
author: cfl
获取医生列表
"""
import requests
from bs4 import BeautifulSoup
import re
import csv
import time
import threading
import os
from random import randint

os.chdir('D:/华统/爬虫好大夫')

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/78.0.3904.108 Safari/537.36'}

def read_doc(pathname):
# 全列表
hospital_all_href_list = []
office_all_href_list = []
hospital_office_all_reader_list = csv.reader(open(pathname + 'office.csv', 'r'))
# 删掉第一行
n = 0
for j in hospital_office_all_reader_list:
if n != 0:
hospital_all_href_list.append(j[0])
office_all_href_list.append(j[1])
n += 1

# 已完成列表
try:
office_finish_href_list = []
office_finish_reader_list = csv.reader(open(pathname + 'doctor_finish.csv', 'r'))
for j in office_finish_reader_list:
office_finish_href_list.append(j[0])
office_finish_href_set = set(office_finish_href_list)
except:
office_finish_href_list = []
office_finish_href_set = set(office_finish_href_list)

# 待爬取列表
hospital_href_list = []
office_href_list = []
for j in range(len(office_all_href_list)):
if office_all_href_list[j] not in office_finish_href_set:
hospital_href_list.append(hospital_all_href_list[j])
office_href_list.append(office_all_href_list[j])

return hospital_href_list, office_href_list


def get_doctor(pathname, hospital_href, office_href):
stop = False
pageNo = 0
while not stop:
pageNo += 1
print(pageNo)
url = 'https://www.haodf.com/hospital/%s/%s/daifu.htm?p=%s' % (hospital_href, office_href, str(pageNo))
res = requests.get(url, headers=headers)
html = res.text
html = BeautifulSoup(html, 'lxml')
# 医生列表
try:
table_doctor = html.findAll('table', attrs={'id': 'doc_list_index'})[0]

# 全部医生
td_doctor_list = table_doctor.findAll('td', attrs={'class': 'tda'})
doctor_href_list = []
doctor_home_list = []
doctor_name_list = []
for k in td_doctor_list:
doctor_href = re.findall(r'href="//(.+?)/"', str(k))[0]
doctor_home = re.findall(r'//(.+?)\.haodf\.com', str(k))[1]
doctor_name = re.findall(r'">(.+?)</a>', str(k))[0]
doctor_href_list.append(doctor_href)
doctor_home_list.append(doctor_home)
doctor_name_list.append(doctor_name)
write_doc(pathname,
office_href, doctor_href_list, doctor_home_list, doctor_name_list)
except:
break

def write_table(pathname):
with open(pathname + 'doctor.csv', 'w') as doctor:
doctor.write(','.join(['office_href', 'doctor_href', 'doctor_room', 'doctor_name']))


def write_doc(pathname,
office_href,
doctor_href_list,
doctor_room_list,
doctor_name_list):
with open(pathname + 'doctor.csv', 'a') as doctor:
for j in range(len(doctor_href_list)):
doctor.write('\n' + ','.join([office_href, doctor_href_list[j], doctor_room_list[j], doctor_name_list[j]]))

my_pathname = '../data/'

# 写入表头
write_table(my_pathname)
# 获取待爬取列表
hospital_href_list, office_href_list = read_doc(my_pathname)

for i in range(len(hospital_href_list)):
get_doctor(my_pathname, hospital_href_list[i], office_href_list[i])

Backup of the R scraping code

library(xml2)   # read_html
library(rvest)  # html_nodes / html_text
library(dplyr)
library(stringr)
library(rjson)

# Sys.sleep(5) # sleep 5 seconds so as not to hammer the server

web = read_html('http://ncov.nosensor.com:8080/api/', encoding = 'UTF-8')

web.html = html_text(web)

text = fromJSON(web.html)

city = text$city # per-city records


length(city) # number of days

n = length(city[[24]]$City) # number of cities reported
dat = data.frame('省' = rep(NA, n),
                 '市' = rep(NA, n),
                 '确诊' = rep(NA, n),
                 '死亡' = rep(NA, n),
                 '治愈' = rep(NA, n),
                 '病重' = rep(NA, n),
                 'Critical' = rep(NA, n),
                 '日期' = city[[24]]$Time)

temp = city[[24]]$CityDetail

for (j in 1:length(temp)) {
  dat[j, 1:7] = unlist( temp[[j]] )
}


for (i in (length(city) - 1):1 ) {
  n = length(city[[i]]$City) # number of cities reported
  temp.dat = data.frame('省' = rep(NA, n),
                        '市' = rep(NA, n),
                        '确诊' = rep(NA, n),
                        '死亡' = rep(NA, n),
                        '治愈' = rep(NA, n),
                        '病重' = rep(NA, n),
                        'Critical' = rep(NA, n),
                        '日期' = city[[i]]$Time)

  print(c( city[[i]]$Time, n) )

  temp = city[[i]]$CityDetail
  for (j in 1:length(temp)) {
    temp.dat[j, 1:7] = unlist( temp[[j]] )
  }
  dat = rbind(dat, temp.dat)

}

# Aggregate per province
dat$`省确诊` = NA
dat$`省死亡` = NA
dat$`省治愈` = NA
dat$`省病重` = NA
dat$`省Critical` = NA

# (This backup only fills 省确诊, summing 确诊 over rows that share the same 日期.)
for (i in unique(dat$日期)) {
  index = which(dat$日期 == i)
  dat$省确诊[index] = sum(dat$确诊[index])
}

Backup of the webdriver scraping code

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep
import pandas as pd
import os
import re
import json


def get_county(url):
    driver = webdriver.Chrome()
    driver.get(url)
    sleep(10)
    # Names of all states
    temp = driver.find_element_by_xpath('//*[@id="nav-tabpanel-0"]/div/table/tbody')
    state = temp.find_elements_by_tag_name('th')
    print('Number of US states: ' + str(len(state)))
    state = [i.text for i in state]
    state = state[0:51]  # the 52nd "state" has no data, drop it for now
    county = pd.DataFrame()

    # Counties of each state
    for i in range(len(state)):
        # Go to the next state
        xpath = '//*[@id="nav-tabpanel-0"]/div/table/tbody/tr[' + str(i + 1) + ']/th'
        temp = driver.find_element_by_xpath(xpath)
        ActionChains(driver).click(temp).perform()
        # Open the state page and collect its counties
        temp = driver.find_element_by_xpath('//*[@id="nav-tabpanel-0"]/div/table/tbody')
        name = temp.find_elements_by_tag_name('th')
        name = [i.text for i in name]
        name = pd.DataFrame(name)
        name['state'] = state[i]
        name['url'] = re.sub('state', 'county', driver.current_url) + '/' + name[0]
        name['url'] = [re.sub(' ', '%20', i) for i in name['url']]
        county = county.append(name)
        # Back to the main page
        print('State ' + str(i + 1) + ' (' + state[i] + ') copied')
        driver.back()
        sleep(30)
    # Save
    county.to_csv('county.csv', index = False)
    driver.close()


def download(county):
    options = webdriver.ChromeOptions()
    prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': os.getcwd()}
    options.add_experimental_option('prefs', prefs)
    driver = webdriver.Chrome(chrome_options = options)
    print('There are ' + str(len(county)) + ' counties in total')
    driver.set_window_size(400, 400)
    error = list()
    # Start downloading
    for i in range(1532, len(county)):  # resumed after a mid-run failure; normally range(len(county))
        url = county['url'][i]
        driver.get(url)
        # Download
        sleep(5)
        try:
            temp = driver.find_element_by_xpath('//*[@id="nav-tabpanel-0"]/div/div[3]/div[4]/p')
            ActionChains(driver).click(temp).perform()
            # Click OK
            temp = driver.find_element_by_xpath('/html/body/div[2]/div[3]/div/div[3]/button/span[1]')
            ActionChains(driver).click(temp).perform()
            # Rename the downloaded file
            newname = county.iat[i, 0] + "_" + county.iat[i, 1] + '.json'
            os.rename('covid-data.json', newname)
            print('Finished downloading the JSON file of county ' + str(i + 1))
            print('Do not touch Chrome while the download is running!')
        except:
            print('Download failed for county ' + str(i))
            error.append(i)
            continue
    print('All downloads finished')
    driver.close()
    return error


def clean(county):
    # Merge and clean the JSON files
    dat = pd.DataFrame()
    for i in range(len(county)):
        if (i % 100 == 0): print('now ' + str(i))
        filename = './data/' + county.iat[i, 0] + '_' + county.iat[i, 1] + '.json'
        df = pd.read_json(filename, encoding="utf-8", orient='records')
        df['state'] = county.iat[i, 1]
        df['county'] = county.iat[i, 0]
        dat = pd.concat([dat, df], sort = False)
    return dat


# Main page
url = 'https://covid-19.direct/US'

get_county(url)

county = pd.read_csv('county.csv')

print('Number of counties: ' + str(len(county)))

error = download(county)

print(error)

dat = clean(county)

dat.to_csv('dat.csv', index = False)