Python爬虫实战+数据分析+数据可视化(猫眼电影)

您所在的位置:网站首页 电影数据分析可视化软件 Python爬虫实战+数据分析+数据可视化(猫眼电影)

Python爬虫实战+数据分析+数据可视化(猫眼电影)

2024-07-13 08:34| 来源: 网络整理| 查看: 265

一、爬虫部分

爬虫说明: 1、本爬虫是以面向对象的方式进行代码架构的 2、本爬虫爬取的数据存入到MongoDB数据库中 3、爬虫代码中有详细注释

代码展示

import re import time from pymongo import MongoClient import requests from lxml import html from urllib import parse class CatMovie(): def __init__(self): self.start_url = 'https://maoyan.com/films?showType=3&offset=0' self.url_temp = 'https://maoyan.com/films?showType=3&offset={}' self.detail_url = 'https://maoyan.com/films/{}' # 构造响应头 self.headers = { "Cookie": "__mta=143397386.1607405956154.1608533524873.1608533569928.76; _lxsdk_cuid=174f6b873b49b-005ed8da7476a-3d634f03-144000-174f6b873b5c8; uuid_n_v=v1; __utma=17099173.1780976830.1607406113.1607406113.1607406113.1; __utmz=17099173.1607406113.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); recentCis=52%3D73%3D1; _lxsdk=92DE8D903FA311EB97145540D12763BA74A99EC69EF74E288E03A6373ED78378; _lx_utm=utm_source%3Dmeituanweb; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1608533269,1608533300,1608533544,1608533623; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1608533623; _lxsdk_s=176840dd994-603-17f-1df%7C%7C34; __mta=143397386.1607405956154.1608533569928.1608533622616.77; uuid=70554980435911EB91B6516866DCC34951FCE88748C84B79A0630808E1889048; _csrf=be6825573a1247a5dcf2ed5a6100bacaacccd21643c45ee79a3a7a28c1bb32e9; lt=3dN05zd6hwM_WEa3scYBnu5qcEoAAAAARgwAAMUreBNzDKR9eCuGuYWtOPWt5ULO65alj1dffuIQJisgN0lrWp0kJkyABp6Ly8cJ2A; lt.sig=y5Xz3WT9ooI2TpIM7pzKU9CROfo", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36" } # 猫眼电影的自定义字体映射 self.font_dict = {'unif4ef': '6', 'unif848': '3', 'unif88a': '7', 'unie7a1': '9', 'unie343':'1','unie137':'8','unif489':'0','unie5e2':'4','unif19b':'2','unie8cd':'5'} self.client = MongoClient() self.collection = self.client['test']['cat_movie'] # 构造请求详情页url def get_url_list(self): total_num = 2000 page = total_num//30+1 return [self.url_temp.format(i*30) for i in range(0,page+1)] # 解析请求并解析url地址 def parse_url(self,url): rest = requests.get(url,headers=self.headers) return rest.content.decode() # 解析并获取列表页数据 def 
get_content_list(self,html_str): movie_ids = re.findall(r'href="/films/(.*?)" target',html_str) item_list = [] for i in movie_ids[::3]: item = {} detail_url = self.detail_url.format(i) # 获取到每一个详情数据的唯一标志在通过urljoin构造详情页url item['detail_url'] = parse.urljoin(self.start_url,detail_url) item = self.parse_detail(item['detail_url'],item) print(item) item_list.append(item) return item_list # 解析并获取详情页数据 def parse_detail(self,url,item): time.sleep(0.1) rest = requests.get(url,headers=self.headers) # 先替换掉页面中加密字体的&#x 通过用**包裹方便后续锁定 html_str = re.sub(r'&#x(\w+)?;',r'*uni\1*',rest.content.decode()) html_str = html.etree.HTML(html_str) # 获取信息多采用三目运算符的方式 防止因获取的内容不存在而报异常 # 通过三目运算符进行多重判断可以增加程序的稳定性 movie_name = html_str.xpath('//div[@class="movie-brief-container"]/h1/text()') item['movie_name'] = movie_name[0] if len(movie_name)>0 else None movie_type = html_str.xpath('//div[@class="movie-brief-container"]/ul/li[1]/a/text()') movie_type = movie_type if len(movie_type)>0 else None if movie_type is not None: item['movie_type'] = '·'.join([i.strip() for i in movie_type]) else: item['movie_type'] = '类型未知' area_time = html_str.xpath('//div[@class="movie-brief-container"]/ul/li[2]/text()') area_time = area_time[0] if len(area_time)>0 else None if area_time is not None: area_time = area_time.split('/') item['movie_area'] = area_time[0].strip() if len(area_time)>0 else '上映国家未知' item['movie_duration'] = area_time[1].strip() if len(area_time)>1 else '电影时长未知' else: item['movie_area'] = '上映国家未知' item['movie_duration'] = '电影时长未知' movie_publish = html_str.xpath('//div[@class="movie-brief-container"]/ul/li[3]/text()') movie_publish = movie_publish[0] if len(movie_publish)>0 else None if movie_publish is not None: item['movie_publish'] = re.findall(r'(\d+-\d+-\d+)',movie_publish) item['movie_publish'] = item['movie_publish'][0] if len(item['movie_publish'])>0 else movie_publish else: item['movie_publish'] = '上映时间未知' movie_score = html_str.xpath('//div[@class="movie-index-content score 
normal-score"]/span/span/text()') movie_score = movie_score[0] if len(movie_score)>0 else None if movie_score is not None: item['movie_score'] = re.sub(r'(\*[a-z0-9]+?\*)',lambda x:self.font_dict[x.group(1).strip('*')],movie_score) else: item['movie_score'] = '电影评分未知' movie_comments = html_str.xpath('//span[@class="score-num"]/span/text()') movie_comments = movie_comments[0] if len(movie_comments)>0 else None if movie_comments is not None: item['movie_comments'] = re.sub(r'(\*[a-z0-9]+?\*)',lambda x:self.font_dict[x.group(1).strip('*')],movie_comments) else: item['movie_comments'] = '评论人数未知' movie_booking = html_str.xpath('//div[@class="movie-index-content box"]/span[1]/text()') movie_booking = movie_booking[0] if len(movie_booking)>0 else None if movie_booking is not None: unit = html_str.xpath('//div[@class="movie-index-content box"]/span[2]/text()') unit = unit[0] if len(unit) > 0 else '' item['movie_booking'] = re.sub(r'(\*[a-z0-9]+?\*)', lambda x: self.font_dict[x.group(1).strip('*')],movie_booking) + unit else: item['movie_booking'] = '电影票房未知' movie_director = html_str.xpath('//div[@class="celebrity-container"]//div[1]//div[@class="info"]//a/text()') movie_director = movie_director[0] if len(movie_director)>0 else None if movie_director is not None: item['movie_director'] = movie_director.strip() else: item['movie_director'] = '导演未知' return item # 保存数据 def save(self,content_list): for i in content_list: self.collection.insert(i) # 程序主方法 def run(self): url_list = self.get_url_list() for i in url_list: time.sleep(0.5) html_str = self.parse_url(i) item_list = self.get_content_list(html_str) # self.save(item_list) if __name__ == '__main__': movie = CatMovie() movie.run() 二、数据分析和数据可视化部分

数据分析和数据可视化说明: 1、本博客通过Flask框架来进行数据分析和数据可视化 2、项目的架构图见原文配图（此处原为插图，转载抓取时图片已丢失）

代码展示

数据分析代码展示(analysis.py)

import re
from pymongo import MongoClient
import pandas as pd
import numpy as np
import pymysql


# Number of movies released per year.
def movie_date_publish_count(df):
    """Return [[year_str, count], ...] of movies grouped by release year."""
    grouped = df.groupby('movie_publish_year')['movie_type'].count().reset_index()
    data = grouped.to_dict(orient='records')
    # Convert to the list-of-pairs format expected by the charting frontend.
    data = [[str(i['movie_publish_year']), i['movie_type']] for i in data]
    return data


# Top-10 regions by number of released movies.
def movie_country_publish_top10(df):
    """Return the ten regions with the most releases as [[area, count], ...]."""
    # A movie may list several regions separated by commas, so split
    # movie_area into per-movie lists first.
    series_country = df['movie_area'].str.split(',').tolist()
    # set() keeps each region once, giving the column universe.
    list_country = set([j for i in series_country for j in i])
    # Zero matrix: one row per movie, one column per region; mark membership.
    zero_list = pd.DataFrame(np.zeros((len(series_country), len(list_country))), columns=list_country)
    for i in range(len(zero_list)):
        # Fix: label-based single-step assignment (.loc[row, cols]) instead of
        # chained indexing, which can silently write to a copy.
        zero_list.loc[i, series_country[i]] = 1
    # Column sums give the per-region movie counts.
    # Fix: np.int was removed in NumPy >= 1.24; plain int is equivalent here.
    country_movie_counts = zero_list.sum().astype(int)
    country_movie_counts = country_movie_counts.reset_index()
    country_movie_counts.columns = ['movie_area', 'count']
    # Sort descending and keep the ten busiest regions.
    country_movie_counts = country_movie_counts.sort_values(by='count', ascending=False)[:10]
    data = [[i['movie_area'], i['count']] for i in country_movie_counts.to_dict(orient='records')]
    return data


# Top-10 movies by box office.
def movie_booking_top10(df):
    """Return the ten highest-grossing movies as [[name, booking], ...]."""
    df = df.sort_values(by='movie_booking', ascending=False)
    movie_name_to_booking = df[['movie_name', 'movie_booking']][:10]
    data = [[i['movie_name'], i['movie_booking']] for i in movie_name_to_booking.to_dict(orient='records')]
    return data


# Top-10 movies by comment count.
def movie_comment_top10(df):
    """Return the ten most-commented movies as [[name, comments], ...]."""
    df = df.sort_values(by='movie_comments', ascending=False)
    movie_name_to_comments = df[['movie_name', 'movie_comments']][:10]
    data = [[i['movie_name'], i['movie_comments']] for i in movie_name_to_comments.to_dict(orient='records')]
    return data


# Number of movies per score interval.
def movie_score_count(df):
    # NOTE(review): the original listing was garbled by HTML extraction here
    # (the '<'/'>' comparison operators were swallowed as markup and the tail
    # of the function was lost).  The three filters below are reconstructed
    # from the surviving comment "区间分别为 ... 8.0 三个区间", i.e. the
    # intervals < 7.0, 7.0-8.0 and > 8.0 — confirm against the original post.
    grouped1 = df[df['movie_score'] < 7.0]['movie_score']
    grouped2 = df[(df['movie_score'] >= 7.0) & (df['movie_score'] <= 8.0)]['movie_score']
    grouped3 = df[df['movie_score'] > 8]['movie_score']
    movie_score_to_count = [
        {'movie_score': '7.0分以下', 'count': int(grouped1.count())},
        {'movie_score': '7.0-8.0分', 'count': int(grouped2.count())},
        {'movie_score': '8.0分以上', 'count': int(grouped3.count())},
    ]
    data = [[i['movie_score'], i['count']] for i in movie_score_to_count]
    return data


【本文地址】


今日新闻


推荐新闻


CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3