1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
| """ author: cfl 获取医生列表 """ import requests from bs4 import BeautifulSoup import re import csv import time import threading import os from random import randint
# Switch the working directory so that relative paths used later in this
# script resolve from here.
# NOTE(review): this is an absolute, machine-specific Windows path - confirm
# it exists before running the script on another machine.
os.chdir('D:/华统/爬虫好大夫')
# HTTP headers passed to requests.get() for every page fetch; the User-Agent
# string identifies the client as desktop Chrome 78 on Linux.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) ' 'AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/78.0.3904.108 Safari/537.36'}
def read_doc(pathname):
    """Return the hospital/office pairs that have not been processed yet.

    Reads ``pathname + 'office.csv'``: the first row is skipped as a header,
    column 0 is taken as the hospital href and column 1 as the office href.
    Every office whose href appears in column 0 of
    ``pathname + 'doctor_finish.csv'`` is dropped from the result.  If that
    progress file cannot be opened, no office is considered finished.

    Blank rows in either file are ignored.  Both files are read with the
    platform default text encoding.

    Args:
        pathname: Path prefix that is concatenated directly with the file
            names, so it must end with a path separator when it names a
            directory.

    Returns:
        A tuple ``(hospital_href_list, office_href_list)`` of two parallel
        lists, in the order the rows appear in ``office.csv``.

    Raises:
        OSError: If ``office.csv`` cannot be opened.
        IndexError: If a non-blank ``office.csv`` row has fewer than two
            columns.
    """
    with open(pathname + 'office.csv', 'r', newline='') as office_file:
        office_rows = csv.reader(office_file)
        next(office_rows, None)  # the first row is the header
        all_pairs = [(row[0], row[1]) for row in office_rows if row]

    # Only a missing/unopenable progress file means "nothing finished yet".
    # A blank row must not wipe the whole finished set (it used to, because
    # the resulting IndexError was swallowed by a bare ``except``).
    try:
        with open(pathname + 'doctor_finish.csv', 'r', newline='') as finish_file:
            finished_offices = {row[0] for row in csv.reader(finish_file) if row}
    except OSError:
        finished_offices = set()

    hospital_href_list = []
    office_href_list = []
    for hospital_href, office_href in all_pairs:
        if office_href not in finished_offices:
            hospital_href_list.append(hospital_href)
            office_href_list.append(office_href)
    return hospital_href_list, office_href_list
def get_doctor(pathname, hospital_href, office_href, timeout=30):
    """Fetch the doctor-list pages of one office and append them to doctor.csv.

    Pages are requested one after another, starting at ``p=1``.  For each
    page the ``<table id="doc_list_index">`` element is located, every
    ``<td class="tda">`` cell in it is parsed with regular expressions, and
    the page's doctors are handed to ``write_doc``.  The page number is
    printed before each request.

    Paging stops when a page has no such table, when the table has no doctor
    cells, or when a cell does not match the expected markup (in that case
    nothing from that page is written, as before).

    Args:
        pathname: Path prefix passed through to ``write_doc``.
        hospital_href: Hospital path segment inserted into the page URL.
        office_href: Office path segment inserted into the page URL; also
            written as the first column of every output row.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block the crawl forever.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched,
            including when the timeout expires.
        OSError: If ``write_doc`` cannot write the output file.  These errors
            used to be swallowed silently, losing the page's data.
    """
    page_no = 0
    while True:
        page_no += 1
        print(page_no)
        url = 'https://www.haodf.com/hospital/%s/%s/daifu.htm?p=%s' % (
            hospital_href, office_href, str(page_no))
        res = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(res.text, 'lxml')

        tables = soup.findAll('table', attrs={'id': 'doc_list_index'})
        if not tables:
            # No doctor table on this page: we are past the last page.
            break

        cells = tables[0].findAll('td', attrs={'class': 'tda'})
        if not cells:
            # A table without doctor cells yields nothing to write; stopping
            # here keeps the loop from requesting empty pages indefinitely.
            break

        doctor_href_list = []
        doctor_home_list = []
        doctor_name_list = []
        try:
            for cell in cells:
                cell_html = str(cell)
                doctor_href = re.findall(r'href="//(.+?)/"', cell_html)[0]
                # The second "//<name>.haodf.com" occurrence is used -
                # presumably the doctor's own sub-domain; verify against a
                # live page if the markup changes.
                doctor_home = re.findall(r'//(.+?)\.haodf\.com', cell_html)[1]
                doctor_name = re.findall(r'">(.+?)</a>', cell_html)[0]
                doctor_href_list.append(doctor_href)
                doctor_home_list.append(doctor_home)
                doctor_name_list.append(doctor_name)
        except IndexError:
            # A cell did not contain the expected links: abandon this office
            # without writing the partially parsed page.
            break

        write_doc(pathname, office_href, doctor_href_list, doctor_home_list, doctor_name_list)
def write_table(pathname):
    """Create (or truncate) ``pathname + 'doctor.csv'`` and write its header.

    The header is the four column names joined by commas, written without a
    trailing newline.  The file is opened with the platform default encoding.

    Args:
        pathname: Path prefix concatenated directly with ``'doctor.csv'``.
    """
    column_names = ['office_href', 'doctor_href', 'doctor_room', 'doctor_name']
    header_line = ','.join(column_names)
    with open(pathname + 'doctor.csv', 'w') as out_file:
        out_file.write(header_line)
def write_doc(pathname, office_href, doctor_href_list, doctor_room_list, doctor_name_list):
    """Append one line per doctor to ``pathname + 'doctor.csv'``.

    Each line is written as a newline followed by the comma-joined values
    ``office_href, doctor_href, doctor_room, doctor_name``; the values are
    not quoted or escaped.  The three lists are read in parallel, indexed by
    position in ``doctor_href_list``.

    Args:
        pathname: Path prefix concatenated directly with ``'doctor.csv'``.
        office_href: Value written as the first column of every line.
        doctor_href_list: Second-column values; its length sets the row count.
        doctor_room_list: Third-column values.
        doctor_name_list: Fourth-column values.

    Raises:
        IndexError: If ``doctor_room_list`` or ``doctor_name_list`` is shorter
            than ``doctor_href_list``.
    """
    with open(pathname + 'doctor.csv', 'a') as out_file:
        for idx, doctor_href in enumerate(doctor_href_list):
            fields = [office_href, doctor_href, doctor_room_list[idx], doctor_name_list[idx]]
            out_file.write('\n' + ','.join(fields))
# Script driver: start a fresh doctor.csv, work out which offices still need
# crawling, then crawl them one by one.
# The data directory is relative to the current working directory.
my_pathname = '../data/'

# NOTE(review): write_table opens doctor.csv in 'w' mode, so every run starts
# the output file over - confirm this is intended together with the
# doctor_finish.csv filtering done by read_doc.
write_table(my_pathname)

hospital_href_list, office_href_list = read_doc(my_pathname)

# The two lists are parallel: entry i of each describes the same office.
for i, hospital_href in enumerate(hospital_href_list):
    get_doctor(my_pathname, hospital_href, office_href_list[i])
|