基于python苏宁易购商品信息爬取 |
您所在的位置:网站首页 › 爬取苏宁易购商品信息 › 基于python苏宁易购商品信息爬取 |
"""Scrape product listings from Suning (suning.com) with Selenium.

The approach follows Cui Qingcai's Taobao product-crawler tutorial:

1. Open the Suning home page, locate the search box and the search button,
   type the keyword and click search to reach the product listing.
2. Suning loads its product data through Ajax (unlike Taobao). The lazy
   workaround used here is to scroll down to the bottom pager and then
   sleep so the data can finish loading.
3. Parse the current page and store every product it shows (`parse_html`).
4. Turn pages by typing the page number into the jump box, confirming, and
   then checking that the highlighted page number is the requested one
   (`next_page`).
5. `main` uses the total page count returned by `search` as the loop bound
   and shuts the browser down when finished.
"""
import re
import time

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# `keyword`, `SERVICE_ASK` and `MONGO_TABLE` are used below but never defined
# in this file, so they have to come from sn_config.
from sn_config import *  # noqa: F401,F403

MONGO_URL = 'localhost'
MONGODB = 'SUNING'

# CSS selectors shared by several functions.
_PAGER_INFO = '#bottom_pager > div > span.page-more'
_PRODUCT_ITEM = '#product-list .item-wrap'

# Headless browser; needs the PhantomJS binary installed.
# NOTE(review): PhantomJS support was dropped in Selenium 4 -- use
# webdriver.Chrome() (optionally headless) when upgrading.
browser = webdriver.PhantomJS(service_args=SERVICE_ASK)
connect = pymongo.MongoClient(MONGO_URL)
db = connect[MONGODB]


def _scroll_to_pager(timeout=10):
    """Wait for the bottom pager, scroll it into view and let Ajax data load."""
    pager_info = WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, _PAGER_INFO)))
    browser.execute_script("arguments[0].scrollIntoView();", pager_info)
    # Products are loaded lazily once the bottom of the page is reached.
    time.sleep(3)


def search():
    """Search Suning for `keyword` and scrape the first result page.

    Returns:
        int: total number of result pages reported by the bottom pager.

    Raises:
        ValueError: if the pager text contains no page count.
        selenium.common.exceptions.TimeoutException: if the search box, the
            search button or the pager does not show up in time.
    """
    browser.get('https://www.suning.com/')
    search_box = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#searchKeywords')))
    search_button = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '#searchSubmit')))
    search_box.send_keys(keyword)
    search_button.click()

    # Wait for the result page before touching the pager; looking it up
    # right after the click can fail because the page has not loaded yet.
    _scroll_to_pager()
    pager_info = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, _PAGER_INFO)))
    pager_text = pager_info.text
    # The first run of digits in the pager text is the total page count.
    match = re.search(r'\d+', pager_text)

    parse_html()
    if match is None:
        raise ValueError('no page count found in pager text: %r' % pager_text)
    return int(match.group())


def next_page(page, retries=3):
    """Jump to result page `page` through the pager's jump box and scrape it.

    Args:
        page: 1-based page number to jump to.
        retries: how many times to attempt the jump before skipping the
            page (the original retried forever through recursion).
    """
    print('正在翻页', page)
    for _ in range(retries):
        try:
            time.sleep(1)
            page_input = WebDriverWait(browser, 20).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#bottomPage')))
            time.sleep(1)
            page_input.clear()
            confirm = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#bottom_pager > div > a.page-more.ensure')))
            page_input.send_keys(str(page))
            confirm.send_keys(Keys.ENTER)
            _scroll_to_pager()
            # Confirm the jump worked: the highlighted pager entry must show
            # the requested page number.
            WebDriverWait(browser, 10).until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#bottom_pager > div > a.cur'),
                    str(page)))
        except WebDriverException as e:
            print(e.args)
            continue
        parse_html()
        return
    # Every attempt failed: skip this page instead of looping forever.
    print('翻页失败,已跳过', page)


def parse_html(retries=2):
    """Store every product shown on the current result page in MongoDB.

    Each product is saved to `db[MONGO_TABLE]` as a document with the keys
    'price', 'description' and 'shop'.

    Args:
        retries: extra waits for the product list after the first one
            times out; the page is skipped when all of them time out.
    """
    for _ in range(retries + 1):
        try:
            WebDriverWait(browser, 20).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, _PRODUCT_ITEM)))
            break
        except TimeoutException:
            continue
    else:
        # The product list never appeared; nothing to parse.
        print('等待商品列表超时')
        return

    doc = pq(browser.page_source)
    for item in doc(_PRODUCT_ITEM).items():
        product = {
            'price': item.find('.def-price').text().replace('\n', ' '),
            'description': item.find('.title-selling-point').text()
                               .strip().replace('\n', ' '),
            'shop': item.find('.store-stock').text(),
        }
        # insert_one raises on failure, so reaching the print means success.
        db[MONGO_TABLE].insert_one(product)
        print('正在保存', product.get('description'))


def main():
    """Crawl every result page for `keyword`, then shut the browser down."""
    try:
        total = search()
        # Page 1 is scraped by search(); continue from page 2.
        for page in range(2, total + 1):
            next_page(page)
            time.sleep(2)
    except Exception as e:
        print(e.args)
    finally:
        # quit() ends the whole driver session; close() would only close
        # the current window and leave the driver process running.
        browser.quit()


if __name__ == '__main__':
    main()
CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3 |