python爬取百度汉语(汉字,部首,笔划数,拼音,相关组词,基本释义) |
您所在的位置:网站首页 › 汉字部首组字 › python爬取百度汉语(汉字,部首,笔划数,拼音,相关组词,基本释义) |
文章目录
# Scrape Baidu Hanyu (hanyu.baidu.com) for per-character information —
# radical, stroke count, pinyin, related words, and basic meaning — for
# every character found in an input Word (.docx) document, saving one
# .xlsx file per character under D:\baiduhanyu.
import urllib.request as ur
import urllib.parse as up
from lxml import etree
import os
import pandas as pd
from docx import Document


def openUrl(url):
    """Fetch *url* and return the raw response bytes.

    A browser-like User-Agent is required: Baidu serves different/blocked
    content to clients that look like scripts.
    """
    request = ur.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0')
    response = ur.urlopen(request)
    return response.read()


def readWordFile(file_path):
    """Return every individual character from all paragraphs of a .docx file.

    Each paragraph's text is exploded into single characters, so the result
    may include whitespace and punctuation; callers should filter as needed.
    """
    doc = Document(file_path)
    words = []
    for paragraph in doc.paragraphs:
        words.extend(list(paragraph.text))
    return words


def getMeaning(word):
    """Scrape Baidu Hanyu for *word* (one character) and save the result.

    Builds the query URL, extracts radical / stroke count / pinyin /
    related words / basic meaning via XPath, and hands the parsed record
    to save().
    """
    parse_json_list = {}
    print("\n正在爬取信息...")
    parse_json = {}
    url = ('https://hanyu.baidu.com/zici/s?from=aladdin&query='
           + up.quote(word) + '&srcid=51368&wd=' + up.quote(word))
    # decode with errors='ignore': the page is expected to be UTF-8 but a
    # stray byte must not abort the whole run.
    html = etree.HTML(openUrl(url).decode('utf-8', errors='ignore'))
    radical = html.xpath("//li[@id='radical']//span//text()")
    stroke_count = html.xpath("//li[@id='stroke_count']//span//text()")
    pinyin = html.xpath("//div[@id='pinyin']//span//b//text()")
    related_words = html.xpath("//div[@class='content link-terms']//a//text()")
    basic_meaning = ''.join(
        str(e) for e in html.xpath("//div[@id='basicmean-wrapper']//p/text()")
    ).replace(" ", "").replace("\n", "")
    parse_json["radical"] = radical
    parse_json["stroke_count"] = stroke_count
    parse_json["related_words"] = related_words
    parse_json["pinyin"] = pinyin
    parse_json["basic_meaning"] = basic_meaning
    # Keyed by URL so save() can record the source page alongside the data.
    parse_json_list[url] = parse_json
    save(parse_json_list, word)


# Explicit field order matching the DataFrame columns in save(); the
# original code relied on dict insertion order, which silently breaks if
# getMeaning() ever assigns keys in a different order.
_FIELD_ORDER = ["radical", "stroke_count", "related_words", "pinyin",
                "basic_meaning"]
# These list-valued fields are joined with spaces; the rest are concatenated.
_SPACE_JOINED = {"related_words", "pinyin"}


def save(parse_json_list, word):
    """Write one row per scraped URL to D:\\baiduhanyu\\<word>.xlsx.

    Columns: 字, URL, 部首, 笔划数, 相关组词, 拼音, 基本释义.
    Creates the output directory on first use.
    """
    print("\n开始保存数据...\n")
    response_all = 'D:\\baiduhanyu'
    if not os.path.exists(response_all):
        os.makedirs(response_all)
    save_file_path = os.path.join(response_all, word + ".xlsx")
    excel_data = []
    for url, parse_json in parse_json_list.items():
        row = [word, url]
        for key in _FIELD_ORDER:
            value = parse_json.get(key, [])
            sep = ' ' if key in _SPACE_JOINED else ''
            row.append(sep.join(value))
        excel_data.append(row)
    df = pd.DataFrame(
        excel_data,
        columns=["字", "URL", "部首", "笔划数", "相关组词", "拼音", "基本释义"])
    df.to_excel(save_file_path, index=False)


if __name__ == '__main__':
    docx_file_path = input('请输入Word文档路径: ')
    # Reuse readWordFile() instead of duplicating the docx-reading loop here
    # (the original inlined an identical copy of it).
    for word in readWordFile(docx_file_path):
        # Skip whitespace characters — they would otherwise be sent to Baidu
        # as empty queries and produce junk output files.
        if not word.strip():
            continue
        getMeaning(word)
今日新闻 |
推荐新闻 |
CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3 |