记录下爬虫的学习内容。爬虫是从网页上爬取内容
学习网站
- http://c.biancheng.net/view/2011.html
- https://www.jianshu.com/p/0c0cb9867b44

网页的知识

网页一般由三部分组成，分别是 HTML（超文本标记语言）、CSS（层叠样式表）和 JScript（活动脚本语言）
打开网页，ctrl + U可以快速查看内容
有时候ctrl+U没有用，是因为加了<script>，这个要么解JS、要么webdriver
还有种方法，点F12

HTML

主要看菜鸟教程
这里特别要注意网页有没有加<iframe>，这里吃了大亏==

<html>..</html> 表示标记中间的元素是网页
<body>..</body> 表示用户可见的内容
<div>..</div> 表示框架
<p>..</p> 表示段落
<li>..</li>表示列表
<img>..</img>表示图片
<h1>..</h1>表示标题
<a href="">..</a>表示超链接

爬虫合法性

几乎每一个网站都有一个名为robots.txt 的文档

robots.txt百度百科

Sitemap是网站上有哪些可供抓取的网页

Python

主要参考来源CSDN

1
2
3

import requests
res = requests.get(url, headers, )
BeautifulSoup

Beautiful Soup 解析网页，官网

1 2	from bs4 import BeautifulSoup soup = BeautifulSoup(res.text,'html.parser')

R语言爬虫

library(xml2) #read_html
library(rvest) # html_nodes
web = read_html()
web.html = html_nodes(web, 'xxxx')
web.test = html_text(web.html)

webdriver爬虫

这个是我2020年4月爬美国肺炎用的==，因为解决不了noscript的问题
要记得匹配chrome webdriver和chrome的版本
Selenium基础
xpath可以点网页某个地方，鼠标右键检查，之后会出现网页的开发者模式，在开发者模式中点右键，查看xpath位置，就是可以定位了

下面是实际操控备份的代码

爬取好大夫

查看合法性：https://www.haodf.com/robots.txt

User-agent: *
Disallow: /chaos/*
Disallow: /api/*
Disallow: /index/*
Disallow: /bingli/*
Disallow: /message/*
Disallow: /passport/*
Disallow: /senderr/*
Sitemap: http://www.haodf.com/sitemap.xml

从https://www.haodf.com/sitemap.html中查看可爬取的内容

https://blog.csdn.net/qq_27302597/article/details/79411808

1	import execjs

js解码

def get_521_content(url):
    req = requests.get(url, headers=headers, timeout=5)
    print(req.status_code,req.text)
    if req.status_code == 521:
        cookies = dict(req.cookies.items())
        print(cookies)
        js_con = ''.join(re.findall('<script>(.*?)</script>', req.text))
        if js_con:
            __jsl_clearance = fixed_fun(js_con, url)
            if __jsl_clearance:
                key, value = __jsl_clearance.split('=')
                cookies[key] = value
                return cookies

# 执行js代码获取cookies 的__jsl_clearance的键值
def fixed_fun(js_con, url):  # js_con 第一次请求获取的js内容
 
    func_return = js_con.replace('eval(', 'return(')
    print('第一次替换eval==》return后：  ', func_return)
    content = execjs.compile(func_return)
    fn = js_con.split('=')[0].split(' ')[1]
    evaled_func = content.call(fn)
    print('第一次执行js代码后： ', evaled_func)
    fn = evaled_func.split('=')[0].split(' ')[1]  # 获取动态函数名
    aa = evaled_func.split("<a href=\\'/\\'>")  # 获取<a>标签的内容
    aa = aa[1].split("</a>")[0] if len(aa) >= 2 else  ''
    mode_func = evaled_func. \
        replace(
        "setTimeout('location.href=location.pathname+location.search.replace(/[\\?|&]captcha-challenge/,\\'\\')',1500);document.cookie=",
        'return'). \
        replace(';if((function(){try{return !!window.addEventListener;}', ''). \
        replace(
        "}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")",
        ''). \
        replace(
        "if((function(){try{return !!window.addEventListener;}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")",
        ''). \
        replace("return'__jsl_clearance", "var window={};return '__jsl_clearance"). \
        replace(
        "var " + fn + "=document.createElement('div');" + fn + ".innerHTML='<a href=\\'/\\'>" + aa + "</a>';" + fn + "=" + fn + ".firstChild.href",
        "var " + fn + "='" + url + "'")
    print('第二次替换后的js代码：', mode_func)
    try:
        content = execjs.compile(mode_func)
        cookies = content.call(fn)
        __jsl_clearance = cookies.split(';')[0]
        print(__jsl_clearance)
        return __jsl_clearance
    except:
        print('js执行错误:', mode_func)
        return None
 
# 携带解密后的cookies第二次爬取详情页
def con_spider(cookies, url):
    response = requests.get(url, headers=headers, cookies=cookies, timeout=5)
    if response.status_code == 200:
        response.encoding = 'utf-8'
        print(response.status_code)
        #print(response.text)
        return response
    else:
        print('第二次爬取错误状态码：', response.status_code)
        return None

cookies = get_521_content(url)
con_spider(cookies, url)

获取医院

"""
author:cfl
    获取医院列表.
"""
import requests
from bs4 import BeautifulSoup
import re
import csv


headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/78.0.3904.108 Safari/537.36'}

# 获取全部省
def get_all_province(url):
    # 获取页面内容
    res = requests.get(url, headers=headers)
    html = res.text
    html = BeautifulSoup(html, 'lxml')

    # 获取全部省
    div_province = html.findAll('div', attrs={'class': 'ct'})[0]
    div_province_list = div_province.findAll('div', attrs={'class': re.compile(r'kstl')})
    province_name_list = []
    province_href_list = []
    for j in div_province_list:
        name = re.findall(r'>(.+?)<', str(j))[0]
        href = re.findall(r'yiyuan/(.+?)/', str(j))[0]
        province_name_list.append(name)
        province_href_list.append(href)

    return province_name_list, province_href_list


# 获取医院
def get_hospital_id(pathname,
                    province_name, province_href):
    url = 'https://www.haodf.com/yiyuan/%s/list.htm' % province_href

    # 获取页面内容
    res = requests.get(url, headers=headers)
    html = res.text
    html = BeautifulSoup(html, 'lxml')

    # 当前省的全部城市
    div_city = html.findAll('div', attrs={'class': 'ct'})[1]

    # 全部城市名
    div_city_name_list = div_city.findAll('div', attrs={'class': 'm_title_green'})
    city_name_list = []
    for j in div_city_name_list:
        city_name = re.findall(r'>(.+?)<', str(j))[0]
        city_name_list.append(city_name)

    # 按区域获取医院
    hospital_address_list = []
    hospital_name_list = []
    hospital_href_list = []
    for j in range(len(city_name_list)):
        # 区域
        div_district = div_city.findAll('div', attrs={'class': 'm_ctt_green'})[j]
        a_list = div_district.findAll('a', attrs={'href': re.compile(r'hospital')})
        for k in a_list:
            name = re.findall(r'>(.+?)<', str(k))[0]
            href = re.findall(r'hospital/(.+?)\.htm', str(k))[0]
            hospital_address_list.append(province_name + city_name_list[j])
            hospital_name_list.append(name)
            hospital_href_list.append(href)
        # 写入医院数据
    write_doc(pathname,hospital_address_list, hospital_href_list, hospital_name_list)
    
# 'hospital_address', 'hospital_href', 'hospital_name'
def write_doc(pathname,
              hospital_address_list, 
              hospital_href_list, 
              hospital_name_list):
    # 医院
    with open(pathname + 'hospital.csv', 'a') as hospital:
        # 写入行数据
        for j in range(len(hospital_name_list)):
            hospital.write('\n'+','.join([hospital_address_list[j],hospital_href_list[j],hospital_name_list[j]]))


if __name__ == '__main__':
    my_pathname = '../data/'
    with open(my_pathname + 'hospital.csv', 'w') as hospital:
        hospital.write(','.join(['address','href','name']))
    # 全部医院列表
    my_url = 'https://www.haodf.com/yiyuan/all/list.htm'
    # 省名称、链接
    my_province_name_list, my_province_href_list = get_all_province(my_url)
    for i in range(len(my_province_name_list)):
        get_hospital_id(my_pathname, my_province_name_list[i], my_province_href_list[i])

获取科室

"""
author: cfl
    获取科室列表
"""
import requests
from bs4 import BeautifulSoup
import re
import csv
import time
import threading
import os
from random import randint


headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/78.0.3904.108 Safari/537.36'}


def read_doc(pathname):
    # 全列表
    hospital_all_href_list = []
    hospital_all_name_list = []
    hospital_all_reader_list = csv.reader(open(pathname + 'hospital.csv', 'r'))
    # 删掉第一行
    n = 0
    for j in hospital_all_reader_list:
        if n != 0:
            hospital_all_href_list.append(j[1])
            hospital_all_name_list.append(j[2])
        n += 1

    # 已完成列表
    try:
        office_finish_href_list = []
        office_finish_reader_list = csv.reader(open(pathname + 'office_finish.csv', 'r'))
        for j in office_finish_reader_list:
            office_finish_href_list.append(j[0])
        office_finish_href_set = set(office_finish_href_list)
    except:
        office_finish_href_list = []
        office_finish_href_set = set(office_finish_href_list)

    # 待爬取列表
    hospital_href_list = []
    hospital_name_list = []
    
    for j in range(len(hospital_all_href_list)):
        if hospital_all_href_list[j] not in office_finish_href_set:
            hospital_href_list.append(hospital_all_href_list[j])
            hospital_name_list.append(hospital_all_name_list[j])
    return hospital_href_list,hospital_name_list


def get_office(pathname, href, Name):
    url = 'https://www.haodf.com/hospital/%s/daifu.htm' % href
    # 获取页面内容
    res = requests.get(url, headers=headers)
    html = res.text
    html = BeautifulSoup(html, 'lxml')

    # 科室类型
    div_office_type = html.findAll('div', attrs={'class': 'intro_doc-nav-mod'})[0]

    # 全部科室类型名
    div_office_type_name_list = div_office_type.findAll('div', attrs={'class': 'de_title-mod'})
    office_type_name_list = []
    for j in div_office_type_name_list:
        office_type_name = re.findall(r'>(.+?)<', str(j))[0]
        office_type_name_list.append(office_type_name)

    # 按科室类型获取科室
    for j in range(len(office_type_name_list)):
        # 科室
        ul_office = div_office_type.findAll('ul', attrs={'class': 'de_content-mod'})[j]
        a_list = ul_office.findAll('a', attrs={'href': re.compile(r'daifu')})

        office_href_list = []
        office_name_list = []
        for k in a_list:
            office_href = re.findall(r'hospital/.+?/(.+?)/daifu\.htm', str(k))[0]
            name = re.findall(r'>(.+?)<', str(k))[0]
            office_href_list.append(office_href)
            office_name_list.append(Name + '_' + office_type_name_list[j] + '_' + name)
        # 写入医院数据
        write_doc(pathname,
                  href, office_href_list, office_name_list)


def write_table(pathname):
   with open(pathname + 'office.csv', 'w') as office:
       office.write(','.join(['address', 'office_href', 'office_name']))
   
# 'href', 'office'
def write_doc(pathname,
              href, office_href_list, office_name_list):
    # 医院细节信息
    with open(pathname + 'office.csv', 'a') as office:
        # 写入行数据
        for j in range(len(office_href_list)):
            office.write('\n'+','.join([href, office_href_list[j], office_name_list[j]]))


my_pathname = '../data/'
# 写入表头
write_table(my_pathname)
# 获取待爬取列表
href,name = read_doc(my_pathname)
error = []
# 记录一家医院，协和医院210
for i in range(len(href)):
    time.sleep(randint(2, 5))
    try:
        get_office(my_pathname, href[i], name[i])
    except:
        error = error.append(i)

爬取医生信息

"""
author: cfl
    获取医生列表
"""
import requests
from bs4 import BeautifulSoup
import re
import csv
import time
import threading
import os
from random import randint

os.chdir('D:/华统/爬虫好大夫')

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/78.0.3904.108 Safari/537.36'}

def read_doc(pathname):
    # 全列表
    hospital_all_href_list = []
    office_all_href_list = []
    hospital_office_all_reader_list = csv.reader(open(pathname + 'office.csv', 'r'))
    # 删掉第一行
    n = 0
    for j in hospital_office_all_reader_list:
        if n != 0:
            hospital_all_href_list.append(j[0])
            office_all_href_list.append(j[1])
        n += 1

    # 已完成列表
    try:
        office_finish_href_list = []
        office_finish_reader_list = csv.reader(open(pathname + 'doctor_finish.csv', 'r'))
        for j in office_finish_reader_list:
            office_finish_href_list.append(j[0])
        office_finish_href_set = set(office_finish_href_list)
    except:
        office_finish_href_list = []
        office_finish_href_set = set(office_finish_href_list)

    # 待爬取列表
    hospital_href_list = []
    office_href_list = []
    for j in range(len(office_all_href_list)):
        if office_all_href_list[j] not in office_finish_href_set:
            hospital_href_list.append(hospital_all_href_list[j])
            office_href_list.append(office_all_href_list[j])

    return hospital_href_list, office_href_list


def get_doctor(pathname, hospital_href, office_href):
    stop   = False
    pageNo = 0
    while not stop:
        pageNo += 1
        print(pageNo)
        url = 'https://www.haodf.com/hospital/%s/%s/daifu.htm?p=%s' % (hospital_href, office_href, str(pageNo))
        res = requests.get(url, headers=headers)
        html = res.text
        html = BeautifulSoup(html, 'lxml')
        # 医生列表
        try:
            table_doctor = html.findAll('table', attrs={'id': 'doc_list_index'})[0]

            # 全部医生
            td_doctor_list = table_doctor.findAll('td', attrs={'class': 'tda'})
            doctor_href_list = []
            doctor_home_list = []
            doctor_name_list = []
            for k in td_doctor_list:
                doctor_href = re.findall(r'href="//(.+?)/"', str(k))[0]
                doctor_home = re.findall(r'//(.+?)\.haodf\.com', str(k))[1]
                doctor_name = re.findall(r'">(.+?)</a>', str(k))[0]
                doctor_href_list.append(doctor_href)
                doctor_home_list.append(doctor_home)
                doctor_name_list.append(doctor_name)
            write_doc(pathname,
                      office_href, doctor_href_list, doctor_home_list, doctor_name_list)
        except:
            break

def write_table(pathname):
     with open(pathname + 'doctor.csv', 'w') as doctor:
         doctor.write(','.join(['office_href', 'doctor_href', 'doctor_room', 'doctor_name']))


def write_doc(pathname,
              office_href, 
              doctor_href_list, 
              doctor_room_list, 
              doctor_name_list):
    with open(pathname + 'doctor.csv', 'a') as doctor:
        for j in range(len(doctor_href_list)):
            doctor.write('\n' + ','.join([office_href, doctor_href_list[j], doctor_room_list[j], doctor_name_list[j]]))

my_pathname = '../data/'

# 写入表头
write_table(my_pathname)
# 获取待爬取列表
hospital_href_list, office_href_list = read_doc(my_pathname)

for i in range(len(hospital_href_list)):
    get_doctor(my_pathname, hospital_href_list[i], office_href_list[i])

R语言爬虫备份

library(xml2) #read_html
library(rvest) # html_nodes
library(dplyr)
library(stringr)
library(rjson)

# Sys.sleep(5) # 系统沉睡5秒，防止过度占比服务器

web = read_html('http://ncov.nosensor.com:8080/api/', encoding = 'UTF-8')

web.html = html_text(web)

text = fromJSON(web.html)

city = text$city # 这里取每个市


length(city) # 有多少天

n = length(city[[24]]$City) # 统计的市
dat = data.frame('省' = rep(NA, n),
                 '市' = rep(NA, n),
                 '确诊' = rep(NA, n),
                 '死亡' = rep(NA, n),
                 '治愈' = rep(NA, n),
                 '病重' = rep(NA, n),
                 'Critical' = rep(NA, n),
                 '日期' = city[[24]]$Time)

temp = city[[24]]$CityDetail

for (j in 1:length(temp)) {
  dat[j, 1:7] = unlist( temp[[j]] )
}


for (i in (length(city) - 1):1 ) {
  n = length(city[[i]]$City) # 统计的市
  temp.dat = data.frame('省' = rep(NA, n),
                        '市' = rep(NA, n),
                        '确诊' = rep(NA, n),
                        '死亡' = rep(NA, n),
                        '治愈' = rep(NA, n),
                        '病重' = rep(NA, n),
                        'Critical' = rep(NA, n),
                        '日期' = city[[i]]$Time)
  
  print(c( city[[i]]$Time, n) )
  
  temp = city[[i]]$CityDetail
  for (j in 1:length(temp)) {
    temp.dat[j, 1:7] = unlist( temp[[j]] )
  }
  dat = rbind(dat, temp.dat)
  
}

# 对每个省统计
dat$`省确诊` = NA
dat$`省死亡` = NA
dat$`省治愈` = NA
dat$`省病重` = NA
dat$`省Critical` = NA

for (i in unique(dat$日期)) {
  index = which(dat$日期 == i)
  dat$省确诊[index] = sum(dat$确诊[index])
}

webdriver爬虫备份

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep
import pandas as pd
import os
import re
import json


def get_county(url):
    driver = webdriver.Chrome()
    driver.get(url)
    sleep(10)
    # 所有state名称
    temp = driver.find_element_by_xpath('//*[@id="nav-tabpanel-0"]/div/table/tbody')
    state = temp.find_elements_by_tag_name('th')
    print( '美国州个数为：' + str(len(state)) )
    state = [i.text for i in state]
    state = state[0:51] # 第五十二个州？没有数据，先删除
    county = pd.DataFrame()
    
    # 每个county
    for i in range(len(state)):
        # 下一个网址
        xpath = '//*[@id="nav-tabpanel-0"]/div/table/tbody/tr[' + str(i + 1) + ']/th'
        temp = driver.find_element_by_xpath(xpath)
        ActionChains(driver).click(temp).perform()
        # 点开城市，统计county
        temp = driver.find_element_by_xpath('//*[@id="nav-tabpanel-0"]/div/table/tbody')
        name = temp.find_elements_by_tag_name('th')
        name = [i.text for i in name]
        name = pd.DataFrame(name)
        name['state'] = state[i]
        name['url'] = re.sub('state', 'county', driver.current_url) + '/' + name[0]
        name['url'] = [re.sub(' ', '%20', i) for i in name['url']]
        county = county.append(name)
    # 返回主网页
    print('第' + str(i +1) + '个城市' + state[i] + '已经copy完毕')
    driver.back()
    sleep(30)
    # 保存
    county.to_csv('county.csv', index = False)
    driver.close()



def download(county):
    options = webdriver.ChromeOptions()
    prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': os.getcwd()}
    options.add_experimental_option('prefs', prefs)
    driver = webdriver.Chrome(chrome_options = options)
    print('一共' + str(len(county)) + '个county')
    driver.set_window_size(400,400)
    error = list()
    # 开始下载
    for i in range(1532, len(county)): # 中途出错了，从157开始 range(len(county))
        url = county['url'][i]
        driver.get(url)
        # 下载
        sleep(5)
        try:
            temp = driver.find_element_by_xpath('//*[@id="nav-tabpanel-0"]/div/div[3]/div[4]/p')
            ActionChains(driver).click(temp).perform()
            # 点ok
            temp = driver.find_element_by_xpath('/html/body/div[2]/div[3]/div/div[3]/button/span[1]')
            ActionChains(driver).click(temp).perform()
            # 重命名
            newname = county.iat[i, 0] + "_" + county.iat[i, 1] + '.json'
            os.rename('covid-data.json', newname)
            print('已经下载完第' + str(i + 1) + '个城市的json文件')
            print('请不要碰chrome，请不要碰chrome，请不要碰chrome，重要的事情说三遍！！！')
        except:
            print('第' + str(i) + '个城市下载出错')
            error.append(i)
            continue
    print('已经全部下载完毕')
    driver.close()
    return error

def clean(county):
    # 清洗json文件
    dat = pd.DataFrame() 
    for i in range(len(county)):
        if(i%100 == 0): print('now ' + str(i))
        filename = './data/' + county.iat[i, 0] + '_' + county.iat[i, 1] + '.json'
        df = pd.read_json(filename, encoding="utf-8", orient='records')
        df['state'] = county.iat[i, 1]
        df['county'] = county.iat[i, 0]
        dat = pd.concat([dat, df], sort = False)
    return dat


# 获取主网页
url = 'https://covid-19.direct/US'    

get_county(url)

county = pd.read_csv('county.csv')

print('county的数量为:' + str(len(county)))

error = download(county)

print(error)

dat = clean(county)

dat.to_csv('dat.csv', index = False)