快速问医生(ask120.com) |
您所在的位置:网站首页 › 快速问医生在线咨询的问题 › 快速问医生(ask120.com) |
数据爬取
#导入所需包
import requests
from bs4 import BeautifulSoup
from lxml import etree
import time
import csv
# Listing URL for the endocrinology department's online consultations; the
# crawl loops append the page number to it.
url = 'https://www.120ask.com/list/nfmk/'

# Headers sent with every HTTP request (a desktop Firefox user agent).
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:70.0) Gecko/20100101 Firefox/70.0',
}

# Output CSV, opened for appending. newline='' is what the csv module expects;
# utf_8_sig writes a BOM (presumably so that Excel detects the encoding).
fp = open('内分泌.csv', mode='a', newline='', encoding='utf_8_sig')
writer = csv.writer(fp)
# Pass 1: crawl list pages 2-101 (the original note says there are 200 pages in
# total) and scrape the detail page of every question listed on them.


def _strip_whitespace(fragments):
    """Return a copy of *fragments* with all whitespace removed from each string."""
    return [''.join(text.split()) for text in fragments]


def _non_empty(fragments):
    """Return the whitespace-stripped *fragments*, dropping those left empty."""
    return [text for text in _strip_whitespace(fragments) if text]


def _first_or_blank(values):
    """Return the first element of *values*, or '' when *values* is empty."""
    return values[0] if values else ''


try:
    for i in range(2, 102):
        r = url + str(i) + '/'
        html = requests.get(r, headers=headers)
        html.raise_for_status()
        html.encoding = html.apparent_encoding
        soup = BeautifulSoup(html.text, 'html.parser')
        # Follow every first-level link to reach the question's detail page.
        for item in soup.find_all('p', 'h-pp1'):
            link = 'https:' + item.find('a', 'q-quename')['href']
            date_html = requests.get(link, headers=headers).text
            f = etree.HTML(date_html)

            # The XPaths below are absolute, so a detail page with a different
            # layout yields an empty result. Such a field becomes a blank cell
            # instead of raising IndexError and aborting the whole crawl; the
            # row is still written so this pass keeps one row per listed
            # question (presumably needed to line the passes up in Excel).

            # Asker's gender and age.
            ques_gender_age = _first_or_blank(
                f.xpath('/html/body/div[1]/div[5]/div[2]/div[3]/div[1]/div/span[1]/text()'))
            # Question description: first non-blank text fragment.
            ques_des = _first_or_blank(_non_empty(
                f.xpath('/html/body/div[1]/div[5]/div[2]/div[3]/div[2]/p[1]/text()')))
            # Reply time; normalize-space() makes this a string, not a list.
            ans_time = f.xpath('normalize-space(/html/body/div[1]/div[5]/div[2]/div[7]/div[1]/div[2]/div[2]/span/text())')
            # Replier's professional title (kept as a list of fragments).
            anser_position = _non_empty(
                f.xpath('/html/body/div[1]/div[5]/div[2]/div[7]/div[1]/div[1]/div/span[1]/text()'))
            # Replier's areas of expertise (raw text nodes, not cleaned).
            anser_good_at = f.xpath('/html/body/div[1]/div[5]/div[2]/div[7]/div[1]/div[1]/div/span[2]/text()')
            # Reply content; whitespace is removed but empty fragments are kept.
            anser_content = _strip_whitespace(
                f.xpath('/html/body/div[1]/div[5]/div[2]/div[7]/div[1]/div[2]/div[2]/div[1]/div[1]/p/text()'))
            # Time the question was posted.
            release_time = _first_or_blank(
                f.xpath('//*[@id="body_main"]/div[5]/div[2]/div[3]/div[1]/div/span[2]/text()'))
            # Replier's consultation experience (kept as a list of fragments).
            anser_help_amout = _non_empty(
                f.xpath('//*[@id="body_main"]/div[5]/div[2]/div[7]/div[1]/div[1]/div/span[3]/text()'))

            # List-valued fields are written as their str() form by csv.writer.
            writer.writerow((release_time, ques_gender_age, ques_des, ans_time,
                             anser_position, anser_good_at, anser_help_amout,
                             anser_content))
finally:
    # Close (and thereby flush) the CSV even if a request or parse step fails.
    fp.close()
# Pass 2: revisit list pages 2-101 and record each question's department and
# title.
# The output file has already been closed by the time this pass starts, so
# writing through the old `writer` raises
# "ValueError: I/O operation on closed file". Reopen the same path in append
# mode; the `with` block closes it again when the pass ends.
fp.close()  # no-op when already closed; otherwise flushes rows still buffered
with open(fp.name, 'a', newline='', encoding='utf_8_sig') as fp:
    writer = csv.writer(fp)
    for i in range(2, 102):
        r = url + str(i) + '/'
        html = requests.get(r, headers=headers)
        html.raise_for_status()
        html.encoding = html.apparent_encoding
        soup = BeautifulSoup(html.text, 'html.parser')
        for item in soup.find_all('p', 'h-pp1'):
            # Question title, read from the link's title attribute.
            ques_title = item.find('a', 'q-quename')['title']
            # Department: text of the first <a> inside the item.
            department = item.find('a').get_text()
            writer.writerow((department, ques_title))
# Pass 3: revisit list pages 2-101 and record each question's reply count and
# reply status.
# The output file has already been closed by the time this pass starts, so
# writing through the old `writer` raises
# "ValueError: I/O operation on closed file". Reopen the same path in append
# mode; the `with` block closes it again when the pass ends.
fp.close()  # no-op when already closed; otherwise flushes rows still buffered
with open(fp.name, 'a', newline='', encoding='utf_8_sig') as fp:
    writer = csv.writer(fp)
    for i in range(2, 102):
        r = url + str(i) + '/'
        html = requests.get(r, headers=headers)
        html.raise_for_status()
        html.encoding = html.apparent_encoding
        soup = BeautifulSoup(html.text, 'html.parser')
        for item in soup.find_all('div', 'fr h-right-p'):
            # Element that immediately follows the container in document order.
            ans_amout1 = item.find_next()
            # Reply status (the variable to be predicted): text of the element
            # after that one.
            ans_status = ans_amout1.find_next().get_text()
            # Number of replies.
            ans_amout = ans_amout1.get_text()
            writer.writerow((ans_amout, ans_status))
以上爬取结束后,通过 Excel 进行整合。
The shape of our features is: (2414, 12)
# Descriptive statistics for each column
features.describe()
Training Features Shape: (1448, 11)
Training Labels Shape: (1448,)
Testing Features Shape: (966, 11)
Testing Labels Shape: (966,)
建立一个基础的随机森林模型
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators= 10, random_state=44)
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions,test_labels)]
accuracy = (sum(map(int, correct)))/966
print ('accuracy = {0}%'.format(accuracy))
accuracy = 0.860248447204969%
可视化展示树
from sklearn.tree import export_graphviz
import pydot
tree = rf.estimators_[0]
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)
(graph, ) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png');
Variable: reply_speed Importance: 0.18
Variable: doc_help_amount Importance: 0.16
Variable: reply_content Importance: 0.16
Variable: age Importance: 0.15
Variable: reply_amount Importance: 0.12
Variable: department Importance: 0.08
Variable: hospital_level Importance: 0.05
Variable: doc_position Importance: 0.04
Variable: sex Importance: 0.03
Variable: doc_liked Importance: 0.03
Variable: doc_good_at Importance: 0.02
import matplotlib.pyplot as plt
x_values = list(range(len(importances)))
plt.bar(x_values, importances, orientation = 'vertical')
plt.xticks(x_values, feature_list, rotation='vertical')
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title(' Random Forest Variable Importances');
accuracy = 0.8643892339544513%
特征重要性
importances = list(gbdt.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];
Variable: reply_amount Importance: 0.58
Variable: department Importance: 0.18
Variable: age Importance: 0.13
Variable: reply_speed Importance: 0.07
Variable: doc_help_amount Importance: 0.03
Variable: sex Importance: 0.0
Variable: hospital_level Importance: 0.0
Variable: doc_position Importance: 0.0
Variable: doc_good_at Importance: 0.0
Variable: doc_liked Importance: 0.0
Variable: reply_content Importance: 0.0
x_values = list(range(len(importances)))
plt.bar(x_values, importances, orientation = 'vertical')
plt.xticks(x_values, feature_list, rotation='vertical')
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('GBDT Variable Importances');
最优参数:{'penalty': 'l2', 'max_iter': 100, 'C': 0.1, 'fit_intercept': False}
lr = LogisticRegression(C = 0.1, fit_intercept=False, max_iter=100, penalty='l2', random_state=0)
lr.fit(train_features, train_labels)
predictions = lr.predict(test_features)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions,test_labels)]
accuracy = (sum(map(int, correct)))/966
print ('accuracy = {0}%'.format(accuracy))
accuracy = 0.8633540372670807%
分析小结
年龄与其他因素相关性![]() |
今日新闻 |
推荐新闻 |
CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3 |