Python的商品销量预测系统,该怎么导入本地的数据集

您所在的位置:网站首页 python做销售额预测 Python的商品销量预测系统,该怎么导入本地的数据集

Python的商品销量预测系统,该怎么导入本地的数据集

#Python的商品销量预测系统,该怎么导入本地的数据集| 来源: 网络整理| 查看: 265

# Monthly item-sales forecasting notebook for the Kaggle "Predict Future Sales"
# data set: loads the CSV files, cleans shop / item / category metadata,
# filters outliers and trains a LightGBM regressor.  Cells are separated by
# "#%%" markers.

#%%
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#%%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

#%%
# Directory holding the competition CSV files.  Defaults to the Kaggle input
# path; set the SALES_DATA_DIR environment variable to a local folder to run
# this notebook against a local copy of the data set.
DATA_DIR = os.environ.get(
    'SALES_DATA_DIR',
    '/kaggle/input/competitive-data-science-predict-future-sales',
)

train = pd.read_csv(os.path.join(DATA_DIR, 'sales_train.csv'))
shops = pd.read_csv(os.path.join(DATA_DIR, 'shops.csv'))
items = pd.read_csv(os.path.join(DATA_DIR, 'items.csv'))
cat = pd.read_csv(os.path.join(DATA_DIR, 'item_categories.csv'))

# Work on a reproducible 10% sample of the sales records.
train = train.sample(frac=0.1, random_state=42)

#%%
# Fold shop ids 11, 0, 1 and 40 into 10, 57, 58 and 39
# (presumably duplicate entries of the same shops -- verify against shops.csv).
shop_id_map = {11: 10, 0: 57, 1: 58, 40: 39}
remap_mask = train['shop_id'].isin(list(shop_id_map))
train.loc[remap_mask, 'shop_id'] = train.loc[remap_mask, 'shop_id'].map(shop_id_map)
# Sanity check: no row should still carry one of the old ids.
train.loc[train['shop_id'].isin(list(shop_id_map)), 'shop_id']

#%%
# The first word of the shop name (with any '!' stripped) is used as the city.
shops['shop_city'] = shops['shop_name'].map(lambda x: x.split(' ')[0].strip('!'))
shop_types = ['ТЦ', 'ТРК', 'ТРЦ', 'ТК', 'МТРЦ']


def _shop_type(shop_name):
    """Return the second word of *shop_name* if it is a known shop type, else 'Others'."""
    words = shop_name.split(' ')
    # Guard against single-word names, which have no second word.
    if len(words) > 1 and words[1] in shop_types:
        return words[1]
    return 'Others'


shops['shop_type'] = shops['shop_name'].map(_shop_type)
# Shops 12 and 56 are online stores.
shops.loc[shops['shop_id'].isin([12, 56]), ['shop_city', 'shop_type']] = 'Online'
shops.head(13)

#%%
# Encode city and type as integer codes, numbered in order of first appearance.
shop_city_map = {city: code for code, city in enumerate(shops['shop_city'].unique())}
shop_type_map = {kind: code for code, kind in enumerate(shops['shop_type'].unique())}
shops['shop_city_code'] = shops['shop_city'].map(shop_city_map)
shops['shop_type_code'] = shops['shop_type'].map(shop_type_map)
shops.head(7)

#%%
# Remove spaces so that names differing only in spacing compare equal.
items['item_name'] = items['item_name'].map(lambda x: ''.join(x.split(' ')))
duplicated_item_name = items[items['item_name'].duplicated()]
duplicated_item_name_rec = items[items['item_name'].isin(duplicated_item_name['item_name'])]

#%%
# Map every even-positioned duplicate record onto the record that follows it.
# NOTE(review): pairing by position assumes each duplicated name occurs exactly
# twice and that the two records are adjacent in row order -- verify against
# the data before relying on this mapping.
old_id = duplicated_item_name_rec['item_id'].values[::2]
new_id = duplicated_item_name_rec['item_id'].values[1::2]
old_new_map = dict(zip(old_id, new_id))
old_item_mask = train['item_id'].isin(old_id)
train.loc[old_item_mask, 'item_id'] = train.loc[old_item_mask, 'item_id'].map(old_new_map)
# Sanity check: rows still using an old id.
train[train['item_id'].isin(old_id)]

#%%
# Derive a coarse item type from the category name.
# NOTE(review): find(...) > 0 only matches when 'Игры ' occurs after the first
# character; a name that starts with it falls through to the split branch.
cat['item_type'] = cat['item_category_name'].map(
    lambda x: 'Игры' if x.find('Игры ')>0 else x.split(' -')[0].strip('\"')
)
# Manually override the type (last column) of three categories by row position.
cat.iloc[[32,-3, -2], -1] = ['Карты оплаты', 'Чистые носители', 'Чистые носители' ]
item_type_map = {kind: code for code, kind in enumerate(cat['item_type'].unique())}
cat['item_type_code'] = cat['item_type'].map(item_type_map)
cat.head()

#%%
# The sub type is everything after the first '-' in the category name
# (the whole name when there is no '-').
cat['sub_type'] = cat['item_category_name'].map(lambda x: x.split('-',1)[-1])
sub_type_map = {kind: code for code, kind in enumerate(cat['sub_type'].unique())}
cat['sub_type_code'] = cat['sub_type'].map(sub_type_map)

#%%
items = items.merge(
    cat[['item_category_id', 'item_type_code', 'sub_type_code']],
    on='item_category_id',
    how='left',
)
items.head()

#%%
import gc

del cat
gc.collect()

#%%
# x / y / data are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.jointplot(x='item_cnt_day', y='item_price', data=train, kind='scatter')

#%%
train_filtered = train[(train['item_cnt_day'] < 800) & (train['item_price'] < 70000)].copy()

#%%
# Records with an unusually high daily count or price.
outer = train[(train['item_cnt_day'] > 400) | (train['item_price'] > 40000)]

#%%
# Plot every record of the items that produced at least one such outlier.
outer_set = train_filtered[train_filtered['item_id'].isin(outer['item_id'].unique())].groupby('item_id')

fig, ax = plt.subplots(1,1,figsize=(10, 10))
# Combine two palettes: the default one has only about 10 colors, which would
# repeat and make the plot hard to read.
colors = sns.color_palette() + sns.color_palette('bright')
for i, (name, group) in enumerate(outer_set, start=1):
    # Wrap the index so that more groups than colors cannot raise IndexError.
    ax.plot(group['item_cnt_day'], group['item_price'], marker='o', linestyle='',
            ms=12, label=name, c=colors[i % len(colors)])
ax.legend()
plt.show()

#%%
filtered = train[(train['item_cnt_day'] < 400) & (train['item_price'] < 45000)].copy()
filtered.head()

#%%
# Drop every record of items 7238 and 14173.
filtered.drop(index=filtered[filtered['item_id'].isin([7238, 14173])].index, inplace=True)

#%%
del train, train_filtered
gc.collect()

#%%
# NOTE(review): the source text is corrupted at this point.  Everything between
# "filtered.loc[filtered['item_price']" and "11].fillna(0)" is missing (most
# likely markup stripping swallowed the text between a "<" and a ">").  The
# visible code never defines `martix` or `train_set`, so their construction
# must be part of the lost cells.  The surviving fragment is kept verbatim
# below; restore the missing cells from the original notebook before running
# the remainder of this script.
#
#   filtered.loc[filtered['item_price'] 11].fillna(0)
del martix
gc.collect()

#%%
# Shrink the frame: *code columns -> int8, float64 -> float32, int64 -> int16.
for col in train_set.columns:
    if col.find('code') >= 0:
        train_set[col] = train_set[col].astype(np.int8)
    elif train_set[col].dtype == 'float64':
        train_set[col] = train_set[col].astype(np.float32)
    elif train_set[col].dtype == 'int64':
        train_set[col] = train_set[col].astype(np.int16)

train_set['item_type_code'] = train_set['item_type_code'].astype('category')
train_set['sub_type_code'] = train_set['sub_type_code'].astype('category')
train_set.info()

#%%
import lightgbm as lgb

# Split by month index: blocks < 32 train, block 32 validates, block 33 tests.
train_rows = train_set['date_block_num'] < 32
validate_rows = train_set['date_block_num'] == 32
test_rows = train_set['date_block_num'] == 33

X_train = train_set[train_rows].drop(columns=['item_cnt_month'])  # training-set features
Y_train = train_set[train_rows]['item_cnt_month']  # training-set labels
X_validate = train_set[validate_rows].drop(columns=['item_cnt_month'])  # validation set
Y_validate = train_set[validate_rows]['item_cnt_month']
X_test = train_set[test_rows].drop(columns=['item_cnt_month'])  # test set

#%%
Y_true = train_set[test_rows]['item_cnt_month']

#%%
X_test[0:50]

#%%
del train_set
gc.collect()

#%%
train_data = lgb.Dataset(data=X_train, label=Y_train)
validate_data = lgb.Dataset(data=X_validate, label=Y_validate)

#%%
import time

ts = time.time()
params = {
    "objective" : "regression",
    "metric" : "rmse",
    'n_estimators':10000,
    'early_stopping_rounds':50,
    "num_leaves" : 200,
    "learning_rate" : 0.01,
    "bagging_fraction" : 0.9,
    "feature_fraction" : 0.3,
    "bagging_seed" : 0,
}
print('Start....', ts)
# `verbose_eval` was removed from lgb.train() in LightGBM 4.0; the
# log_evaluation callback is its replacement.  Fall back to the old keyword
# only when the callback is not available in the installed version.
if hasattr(lgb, 'log_evaluation'):
    logging_kwargs = {'callbacks': [lgb.log_evaluation(period=1000)]}
else:
    logging_kwargs = {'verbose_eval': 1000}
lgb_model = lgb.train(params, train_data, valid_sets=[train_data, validate_data], **logging_kwargs)
print('End...', time.time() - ts)

#%%
lgb.plot_importance(lgb_model, max_num_features=40, figsize=(12, 8))
# The title in the source was garbled ("Featurertances").
plt.title("Feature Importances")
plt.show()

#%%
X_test.shape

#%%
Y_true.shape

#%%
# Clip the predictions to the range [0, 20].
Y_test = lgb_model.predict(X_test).clip(0, 20)

#%%
# Root-mean-square error of the clipped predictions on month block 33.
error = Y_test - Y_true
rmse = (error**2).mean()**0.5
rmse

#%%
X_test.head(50)

#%%
Y_test[0:50]

#%%
Y_true[0:50]

#%%

这是源代码

另外,这段代码的错误是因为什么?(原文此处似乎缺少对应的截图或代码片段。)



【本文地址】


今日新闻


推荐新闻


CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3