快速问医生(ask120.com)


Data Scraping

```python
# Import the required packages
import requests
from bs4 import BeautifulSoup
from lxml import etree
import time
import csv

fp = open('内分泌.csv', 'a', newline='', encoding='utf_8_sig')
writer = csv.writer(fp)
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:70.0) Gecko/20100101 Firefox/70.0'
}

# Listing page of the endocrinology (内分泌) department's online consultations
url = 'https://www.120ask.com/list/nfmk/'

# Crawl pages 2-101 here (the listing has 200 pages in total)
for i in range(2, 102):
    r = url + str(i) + '/'
    html = requests.get(r, headers=headers)
    html.raise_for_status()
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text, 'html.parser')
    # Collect every first-level link so we can enter each question's detail page
    for item in soup.find_all('p', 'h-pp1'):
        link = 'https:' + item.find('a', 'q-quename')['href']
        date_html = requests.get(link, headers=headers).text
        f = etree.HTML(date_html)
        # Asker's gender and age
        ques_gender_age = f.xpath('/html/body/div[1]/div[5]/div[2]/div[3]/div[1]/div/span[1]/text()')[0]
        # Question description (strip whitespace, drop empty strings)
        ques_des = f.xpath('/html/body/div[1]/div[5]/div[2]/div[3]/div[2]/p[1]/text()')
        ques_des = [''.join(x.split()) for x in ques_des]
        while ques_des.count(''):
            ques_des.remove('')
        ques_des = ques_des[0]
        # Reply time
        ans_time = f.xpath('normalize-space(/html/body/div[1]/div[5]/div[2]/div[7]/div[1]/div[2]/div[2]/span/text())')
        # Responder's professional title
        anser_position = f.xpath('/html/body/div[1]/div[5]/div[2]/div[7]/div[1]/div[1]/div/span[1]/text()')
        anser_position = [''.join(x.split()) for x in anser_position]
        while anser_position.count(''):
            anser_position.remove('')
        # Responder's areas of expertise
        anser_good_at = f.xpath('/html/body/div[1]/div[5]/div[2]/div[7]/div[1]/div[1]/div/span[2]/text()')
        # Reply content
        anser_content = f.xpath('/html/body/div[1]/div[5]/div[2]/div[7]/div[1]/div[2]/div[2]/div[1]/div[1]/p/text()')
        anser_content = [''.join(x.split()) for x in anser_content]
        # Time the question was posted
        release_time = f.xpath('//*[@id="body_main"]/div[5]/div[2]/div[3]/div[1]/div/span[2]/text()')[0]
        # Responder's diagnosis-and-treatment experience (number of people helped)
        anser_help_amout = f.xpath('//*[@id="body_main"]/div[5]/div[2]/div[7]/div[1]/div[1]/div/span[3]/text()')
        anser_help_amout = [''.join(x.split()) for x in anser_help_amout]
        while anser_help_amout.count(''):
            anser_help_amout.remove('')
        writer.writerow((release_time, ques_gender_age, ques_des, ans_time,
                         anser_position, anser_good_at, anser_help_amout, anser_content))
fp.close()

# Second pass: question titles and departments from the listing pages.
# Fix: the file was closed above, so it must be reopened before writing again.
fp = open('内分泌.csv', 'a', newline='', encoding='utf_8_sig')
writer = csv.writer(fp)
for i in range(2, 102):
    r = url + str(i) + '/'
    html = requests.get(r, headers=headers)
    html.raise_for_status()
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text, 'html.parser')
    for item in soup.find_all('p', 'h-pp1'):
        # Question title
        ques_title = item.find('a', 'q-quename')['title']
        # Department
        department = item.find('a').get_text()
        writer.writerow((department, ques_title))
fp.close()

# Third pass: reply counts and reply status, again reopening the file first.
fp = open('内分泌.csv', 'a', newline='', encoding='utf_8_sig')
writer = csv.writer(fp)
for i in range(2, 102):
    r = url + str(i) + '/'
    html = requests.get(r, headers=headers)
    html.raise_for_status()
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text, 'html.parser')
    for item in soup.find_all('div', 'fr h-right-p'):
        ans_amout1 = item.find_next()
        # Reply status (the variable we want to predict)
        ans_status = ans_amout1.find_next().get_text()
        # Number of replies
        ans_amout = ans_amout1.get_text()
        writer.writerow((ans_amout, ans_status))
fp.close()
```
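The script imports `time` but never uses it, and it hits 100 listing pages plus every detail page with no pause between requests. A minimal sketch of a politer fetch helper is below; the delay and retry values are assumptions for illustration, not part of the original code.

```python
import time
import requests

def polite_get(url, headers, delay=1.0, retries=3):
    """Fetch a URL with a fixed delay and a simple retry loop (hypothetical helper)."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            time.sleep(delay)  # be polite to the server between requests
            return resp
        except requests.RequestException:
            time.sleep(delay * (attempt + 1))  # back off a little on failure
    raise RuntimeError('failed to fetch {} after {} attempts'.format(url, retries))
```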

After the crawling above finishes, the three CSV passes are consolidated by hand in Excel. [Figure: consolidated spreadsheet]
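The consolidation could also be scripted. A hedged sketch with pandas follows, assuming the three passes had been written to three separate files (the file names here are hypothetical; the original appended everything to one CSV) and that rows align because each pass visited the listing pages in the same order.

```python
import pandas as pd

# Hypothetical per-pass files; the original used a single appended CSV
detail = pd.read_csv('内分泌_detail.csv', header=None)
titles = pd.read_csv('内分泌_titles.csv', header=None)
status = pd.read_csv('内分泌_status.csv', header=None)

# Valid only if all three passes saw the listing pages in the same order
merged = pd.concat([titles, detail, status], axis=1)
merged.to_excel('moredat1.xlsx', index=False)
```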

Data Processing and Analysis

Data Quantization

[Figure: coding scheme used to quantify the raw fields]
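The figure shows how the raw text fields were coded into numbers. A sketch of what such a coding might look like in pandas is below; the specific mappings are assumptions for illustration, not the scheme from the figure.

```python
import pandas as pd

df = pd.read_excel('moredat1.xlsx')

# Hypothetical codings -- the actual scheme is in the figure above
df['sex'] = df['sex'].map({'男': 0, '女': 1})  # male / female
df['hospital_level'] = df['hospital_level'].astype('category').cat.codes
df['doc_position'] = df['doc_position'].astype('category').cat.codes
```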

Data Import

```python
import pandas as pd
import xlrd  # Excel reader backend used by pandas

features = pd.read_excel('moredat1.xlsx')
features.head(5)
```

[Figure: first five rows of the dataset]

Data Size

```python
print('The shape of our features is:', features.shape)
```

The shape of our features is: (2414, 12)

```python
# Descriptive statistics for each column
features.describe()
```

[Figure: descriptive statistics table]

Data Preprocessing

Label and feature format conversion:

```python
import numpy as np

# The label is whether the asker adopted (accepted) an answer
labels = np.array(features['adoption'])
features = features.drop('adoption', axis=1)
feature_list = list(features.columns)
# Convert to a numpy array
features = np.array(features)
```

Train/test split:

```python
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.4, random_state=42)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)   # 1448
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)     # 966
```

Training Features Shape: (1448, 11)
Training Labels Shape: (1448,)
Testing Features Shape: (966, 11)
Testing Labels Shape: (966,)

Building a Baseline Random Forest Model

```python
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=10, random_state=44)
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)
# Count a prediction as correct when it matches the true 0/1 label
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0
           for (a, b) in zip(predictions, test_labels)]
accuracy = sum(correct) / len(test_labels)  # 966 test samples
print('accuracy = {0}'.format(accuracy))
```

accuracy = 0.860248447204969
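The same number can be obtained in one line with scikit-learn's built-in scorer, which for classifiers returns mean accuracy on the given data:

```python
# Equivalent one-liner: mean accuracy on the held-out test set
print('accuracy =', rf.score(test_features, test_labels))
```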

Visualizing a Tree

```python
from sklearn.tree import export_graphviz
import pydot  # requires the Graphviz binaries to be installed

# Pull one tree out of the forest and render it as a PNG
tree = rf.estimators_[0]
export_graphviz(tree, out_file='tree.dot', feature_names=feature_list,
                rounded=True, precision=1)
(graph, ) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png')
```

[Figure: full-depth tree] The tree is far too large to read, so we reduce its depth.

```python
rf_small = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)
rf_small.fit(train_features, train_labels)
tree_small = rf_small.estimators_[0]
export_graphviz(tree_small, out_file='small_tree.dot', feature_names=feature_list,
                rounded=True, precision=1)
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png')
```

[Figure: depth-3 tree from the reduced forest]

Feature Importance

```python
importances = list(rf.feature_importances_)
# Pair each feature with its importance, rounded to 2 decimals, and sort descending
feature_importances = [(feature, round(importance, 2))
                       for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];
```

Variable: reply_speed          Importance: 0.18
Variable: doc_help_amount      Importance: 0.16
Variable: reply_content        Importance: 0.16
Variable: age                  Importance: 0.15
Variable: reply_amount         Importance: 0.12
Variable: department           Importance: 0.08
Variable: hospital_level       Importance: 0.05
Variable: doc_position         Importance: 0.04
Variable: sex                  Importance: 0.03
Variable: doc_liked            Importance: 0.03
Variable: doc_good_at          Importance: 0.02

```python
import matplotlib.pyplot as plt

x_values = list(range(len(importances)))
plt.bar(x_values, importances, orientation='vertical')
plt.xticks(x_values, feature_list, rotation='vertical')
plt.ylabel('Importance')
plt.xlabel('Variable')
plt.title('Random Forest Variable Importances')
```

[Figure: random forest variable importances, bar chart]

Building a Baseline GBDT Model

```python
from sklearn.ensemble import GradientBoostingClassifier

gbdt = GradientBoostingClassifier(min_samples_split=50, n_estimators=10,
                                  learning_rate=0.1, random_state=0)
gbdt.fit(train_features, train_labels)
predictions = gbdt.predict(test_features)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0
           for (a, b) in zip(predictions, test_labels)]
accuracy = sum(correct) / len(test_labels)
print('accuracy = {0}'.format(accuracy))
```

accuracy = 0.8643892339544513

Feature Importance

```python
importances = list(gbdt.feature_importances_)
feature_importances = [(feature, round(importance, 2))
                       for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];
```

Variable: reply_amount         Importance: 0.58
Variable: department           Importance: 0.18
Variable: age                  Importance: 0.13
Variable: reply_speed          Importance: 0.07
Variable: doc_help_amount      Importance: 0.03
Variable: sex                  Importance: 0.0
Variable: hospital_level       Importance: 0.0
Variable: doc_position         Importance: 0.0
Variable: doc_good_at          Importance: 0.0
Variable: doc_liked            Importance: 0.0
Variable: reply_content        Importance: 0.0

```python
x_values = list(range(len(importances)))
plt.bar(x_values, importances, orientation='vertical')
plt.xticks(x_values, feature_list, rotation='vertical')
plt.ylabel('Importance')
plt.xlabel('Variable')
plt.title('GBDT Variable Importances')
```

[Figure: GBDT variable importances, bar chart]

Logistic Regression Model

Grid search for the optimal parameters:

```python
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# First, search for the best hyperparameters
# (note: in newer scikit-learn, penalty='l1' requires solver='liblinear' or 'saga')
parameters = {
    "penalty": ['l1', 'l2'],
    "C": [0.01, 0.1, 1],
    "fit_intercept": [True, False],
    "max_iter": [100, 150, 200]
}
clf = GridSearchCV(LogisticRegression(random_state=0), param_grid=parameters, cv=3)
clf.fit(train_features, train_labels)
# Report the best parameters found
print("Best parameters:", end="")
print(clf.best_params_)
```

Best parameters: {'penalty': 'l2', 'max_iter': 100, 'C': 0.1, 'fit_intercept': False}

```python
lr = LogisticRegression(C=0.1, fit_intercept=False, max_iter=100,
                        penalty='l2', random_state=0)
lr.fit(train_features, train_labels)
predictions = lr.predict(test_features)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0
           for (a, b) in zip(predictions, test_labels)]
accuracy = sum(correct) / len(test_labels)
print('accuracy = {0}'.format(accuracy))
```

accuracy = 0.8633540372670807
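All three models land near 86% accuracy. Since the adoption label may be imbalanced, accuracy alone can flatter a model; a quick per-class breakdown with scikit-learn's metrics is a useful extra check (a suggested addition, not part of the original analysis):

```python
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision/recall and the confusion matrix for the logistic model
print(confusion_matrix(test_labels, predictions))
print(classification_report(test_labels, predictions))
```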

Analysis Summary

Correlation between age and the other factors:

[Figure: correlation between age and the other factors]

Feature differences across departments:

[Figure: feature comparison across departments]

Strategy Recommendations

[Figure: strategy overview]

For the Ask120 platform: to increase the number of adopted answers, and with them higher profits and a larger social effect, the platform could be optimized with the following measures:

- Use predictions of patients' response delays to recommend hospitals and drugs at the right moment and convert online traffic.
- Patients generally trust doctors with rich online diagnosis-and-treatment experience, so the platform should distribute questions evenly and raise the average treatment volume of its doctors.
- Patients of different ages interact differently: the older the patient, the lower the acceptance rate, and older patients weigh a treatment plan more by the number of replies it receives.
- Each department should improve the indicators that matter most for it.

