基于python苏宁易购商品信息爬取

您所在的位置:网站首页 爬取苏宁易购商品信息 基于python苏宁易购商品信息爬取

基于python苏宁易购商品信息爬取

2023-03-28 11:19| 来源: 网络整理| 查看: 265

本文思路来源于崔庆才老师的淘宝商品爬取教程。

-首先打开苏宁易购网站,找到搜索框以及搜索按钮接口,模拟人工操作,输入关键词,并进行点击搜索操作,进入到商品的详细页。

苏宁首页

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import time
import pymongo
# NOTE(review): the star import presumably supplies SERVICE_ASK, keyword and
# MONGO_TABLE -- confirm against sn_config.py.
from sn_config import *
from selenium.webdriver.common.keys import Keys
from pyquery import PyQuery as pq

# Fixed typos: the host was spelled 'locallost', and the database name was
# bound to MOGODB while the code below reads MONGODB.
MONGO_URL = 'localhost'
MONGODB = 'SUNING'

# Headless browser; PhantomJS has to be installed separately.
# NOTE(review): Selenium 4 removed PhantomJS support -- confirm the Selenium
# version in use, or switch to the Chrome driver below.
browser = webdriver.PhantomJS(service_args=SERVICE_ASK)
# browser = webdriver.Chrome()  # browser with a visible window

connect = pymongo.MongoClient(MONGO_URL)  # connect to MongoDB
db = connect[MONGODB]


def search():
    """Open the Suning home page and submit a search for ``keyword``.

    Waits up to 10 seconds each for the search box to be present and for the
    search button to become clickable, types ``keyword`` and clicks the
    button.
    """
    browser.get('https://www.suning.com/')

    # WebDriverWait blocks until the element is available or the timeout hits.
    search_box = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#searchKeywords'))
    )
    submit = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '#searchSubmit'))
    )

    # The article uses the keyword '电器' (appliances); change it as needed.
    search_box.send_keys(keyword)
    submit.click()

-关于苏宁的数据加载机制,这里说明一下:苏宁的商品数据采用 Ajax 方式加载,这点跟淘宝有所不同。针对这种动态加载的数据,笔者在这里采取了比较偷懒的方法:先模拟人工把页面下拉到最底端,再等待 3 秒让数据完全加载。(图:商品详细页) -在获取商品信息前,先进行页码判断:用高亮显示来判断此时的页码是否为当前页码,并找到总页数,供后面的循环使用,以获取所有的商品信息。

import time

# Wait for the pager summary (the element holding the total page count)
# before touching it; looking it up immediately after the search click races
# against the page load.
target = WebDriverWait(browser, 10).until(
    EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#bottom_pager > div > span.page-more')))

# Scroll the pager into view so the Ajax-loaded products get requested, then
# give them time to arrive.
browser.execute_script("arguments[0].scrollIntoView();", target)
time.sleep(3)

# Look the element up again after the sleep rather than reusing `target`.
total = WebDriverWait(browser, 10).until(
    EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#bottom_pager > div > span.page-more')))

# Extract the page count from the pager text.
_total = total.text
# print(_total)
# The published pattern had lost its backslashes and could never match.
# This one matches one non-digit character followed by the digits, so
# group(1) is the page count and the article's later `group(0)[1:]` slicing
# still yields the same digits.
pattern = re.compile(r'\D(\d+)')
result = re.search(pattern, _total)

-处理商品信息,获取当前页所有商品。

def parse_html(max_retries=3):
    """Save every product shown on the current result page to MongoDB.

    Waits for the product list to be present, parses the page source with
    pyquery and inserts one document per product (price, description, shop)
    into ``db[MONGO_TABLE]``.

    Args:
        max_retries: How many times to wait for the product list before
            giving up on this page.
    """
    # Selenium raises its own TimeoutException, which is not a subclass of
    # the built-in TimeoutError the original code tried to catch.
    from selenium.common.exceptions import TimeoutException

    for attempt in range(1, max_retries + 1):
        try:
            WebDriverWait(browser, 20).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#product-list .item-wrap')))
            break
        except TimeoutException:
            print('商品列表加载超时', attempt)
    else:
        # Bounded retries replace the original unbounded recursion.
        print('商品列表加载失败,跳过当前页')
        return

    doc = pq(browser.page_source)
    for item in doc('#product-list .item-wrap').items():
        product = {
            # '\n', not 'n': the published code stripped every letter n.
            'price': item.find('.def-price').text().replace('\n', ' '),
            'description': item.find('.title-selling-point')
                               .text().strip().replace('\n', ' '),
            'shop': item.find('.store-stock').text(),
        }
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported call and raises on failure.
        db[MONGO_TABLE].insert_one(product)
        print('正在保存', product.get('description'))

-进行翻页操作,这里的实现方法是通过跳转框输入页数,点击确定,再通过高亮来判断是否跳入到了输入的页数。

def next_page(page, max_retries=3):
    """Jump to result page ``page`` via the pager's input box and scrape it.

    Types the page number into the jump box, confirms, scrolls to the pager
    so the Ajax-loaded products are requested, checks that the highlighted
    page number shows ``page`` and then calls ``parse_html()``.

    Args:
        page: Page number to jump to.
        max_retries: Navigation attempts before the page is skipped.
    """
    from selenium.common.exceptions import WebDriverException

    for _ in range(max_retries):
        print('正在翻页', page)
        try:
            # Locate the page-number input box.
            time.sleep(1)
            inputs = WebDriverWait(browser, 20).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#bottomPage')))
            time.sleep(1)
            inputs.clear()

            # Locate the confirm button.
            submit = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#bottom_pager > div > a.page-more.ensure')))
            inputs.send_keys(str(page))
            submit.send_keys(Keys.ENTER)

            # Scroll down to the pager and pause 3 seconds for the products.
            target = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#bottom_pager > div > span.page-more')))
            browser.execute_script("arguments[0].scrollIntoView();", target)
            time.sleep(3)

            # Check that the highlighted page number contains the requested
            # one. The original passed str(page) as the timeout *message* of
            # until(), so nothing was compared.
            WebDriverWait(browser, 10).until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#bottom_pager > div > a.cur'),
                    str(page)))
        except WebDriverException as e:
            print(e.args)
            continue

        # Scrape outside the try block so that a failure while saving does
        # not re-navigate and insert the same page twice.
        parse_html()
        return

    # Bounded retries replace the original unbounded recursion.
    print('翻页失败,已跳过', page)

-定义一个main函数,将上面获取到的总页数作为下一页循环的终止条件,并在执行完毕后关闭浏览器。

def main():
    """Run the crawl: search, then walk result pages 2..total.

    ``search()`` is expected to return the total page count. Any exception
    is printed, and the browser is always shut down afterwards.
    """
    try:
        total = search()  # total number of result pages
        for i in range(2, total + 1):
            next_page(i)
            time.sleep(2)
    except Exception as e:
        print(e.args)
    finally:
        # quit() ends the whole driver session; close() only closes the
        # current window and can leave the driver process running.
        browser.quit()

-下面给出完整代码

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import time
import pymongo
# NOTE(review): the star import presumably supplies SERVICE_ASK, keyword and
# MONGO_TABLE -- confirm against sn_config.py.
from sn_config import *
from selenium.webdriver.common.keys import Keys
from pyquery import PyQuery as pq
from selenium.common.exceptions import TimeoutException, WebDriverException

# Fixed typos: the host was spelled 'locallost', and the database name was
# bound to MOGODB while the code below reads MONGODB.
MONGO_URL = 'localhost'
MONGODB = 'SUNING'

# CSS selectors used in more than one place.
PAGER_SUMMARY = '#bottom_pager > div > span.page-more'
PRODUCT_ITEMS = '#product-list .item-wrap'

# Headless browser; PhantomJS has to be installed separately.
# NOTE(review): Selenium 4 removed PhantomJS support -- confirm the Selenium
# version in use, or switch to the Chrome driver below.
browser = webdriver.PhantomJS(service_args=SERVICE_ASK)
# browser = webdriver.Chrome()  # browser with a visible window

connect = pymongo.MongoClient(MONGO_URL)  # connect to MongoDB
db = connect[MONGODB]


def _scroll_to_pager():
    """Scroll the pager into view, wait for Ajax data, return the pager element."""
    pager = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, PAGER_SUMMARY)))
    browser.execute_script("arguments[0].scrollIntoView();", pager)
    time.sleep(3)
    # Look the element up again after the sleep rather than reusing `pager`.
    return WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, PAGER_SUMMARY)))


def search():
    """Search Suning for ``keyword``, scrape page 1 and return the page count.

    Returns:
        int: Total number of result pages read from the pager text.

    Raises:
        ValueError: If no number can be found in the pager text.
    """
    browser.get('https://www.suning.com/')

    search_box = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#searchKeywords'))
    )
    submit = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '#searchSubmit'))
    )
    search_box.send_keys(keyword)
    submit.click()

    pager_text = _scroll_to_pager().text
    # The published pattern 'SS(d+).*?' had lost its backslashes and never
    # matched. NOTE(review): assumes the first number in the pager text is
    # the total page count -- confirm against the live page.
    match = re.search(r'(\d+)', pager_text)

    parse_html()

    if match is None:
        raise ValueError('no page count found in pager text: %r' % pager_text)
    return int(match.group(1))


def next_page(page, max_retries=3):
    """Jump to result page ``page`` via the pager's input box and scrape it.

    Args:
        page: Page number to jump to.
        max_retries: Navigation attempts before the page is skipped.
    """
    for _ in range(max_retries):
        print('正在翻页', page)
        try:
            # Locate the page-number input box.
            time.sleep(1)
            inputs = WebDriverWait(browser, 20).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#bottomPage')))
            time.sleep(1)
            inputs.clear()

            # Locate the confirm button.
            submit = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#bottom_pager > div > a.page-more.ensure')))
            inputs.send_keys(str(page))
            submit.send_keys(Keys.ENTER)

            # Scroll down to the pager and pause for the products to load.
            _scroll_to_pager()

            # Check that the highlighted page number contains the requested
            # one. The original passed str(page) as the timeout *message* of
            # until(), so nothing was compared.
            WebDriverWait(browser, 10).until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#bottom_pager > div > a.cur'),
                    str(page)))
        except WebDriverException as e:
            print(e.args)
            continue

        # Scrape outside the try block so that a failure while saving does
        # not re-navigate and insert the same page twice.
        parse_html()
        return

    # Bounded retries replace the original unbounded recursion.
    print('翻页失败,已跳过', page)


def parse_html(max_retries=3):
    """Save every product shown on the current result page to MongoDB.

    Args:
        max_retries: How many times to wait for the product list before
            giving up on this page.
    """
    for attempt in range(1, max_retries + 1):
        try:
            WebDriverWait(browser, 20).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, PRODUCT_ITEMS)))
            break
        except TimeoutException:
            # Selenium's TimeoutException is not the built-in TimeoutError
            # the original code tried to catch.
            print('商品列表加载超时', attempt)
    else:
        print('商品列表加载失败,跳过当前页')
        return

    doc = pq(browser.page_source)
    for item in doc(PRODUCT_ITEMS).items():
        product = {
            # '\n', not 'n': the published code stripped every letter n.
            'price': item.find('.def-price').text().replace('\n', ' '),
            'description': item.find('.title-selling-point')
                               .text().strip().replace('\n', ' '),
            'shop': item.find('.store-stock').text(),
        }
        # Collection.insert() was removed in PyMongo 4.
        db[MONGO_TABLE].insert_one(product)
        print('正在保存', product.get('description'))


def main():
    """Run the crawl: search, then walk result pages 2..total."""
    try:
        total = search()  # total number of result pages
        for i in range(2, total + 1):
            next_page(i)
            time.sleep(2)
    except Exception as e:
        print(e.args)
    finally:
        # quit() ends the whole driver session; close() only closes the
        # current window and can leave the driver process running.
        browser.quit()


if __name__ == '__main__':
    main()


【本文地址】


今日新闻


推荐新闻


CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3