#%%
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the Kaggle input directory and print the full path of every file in it.
# (The nested loop bodies are indented here; without indentation the cell
# cannot be parsed.)
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
#%%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plotting theme to the figures drawn below.
sns.set()
#%%
# Load the lookup tables (categories, items, shops) from the competition input.
cat = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv'
)
items = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/items.csv'
)
shops = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/shops.csv'
)
# Load the sales records and keep a reproducible 10% random sample of the rows.
train = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv'
).sample(frac=0.1, random_state=42)
#%%
# Shop ids to rewrite in the sales data: {old shop_id: replacement shop_id}.
shop_id_map = {11: 10, 0: 57, 1: 58, 40: 39}
rows_to_remap = train['shop_id'].isin(shop_id_map)
train.loc[rows_to_remap, 'shop_id'] = train.loc[rows_to_remap, 'shop_id'].map(shop_id_map)
# Check: select rows that still carry one of the old ids (none of the
# replacement ids is itself a key, so this should come back empty).
train.loc[train['shop_id'].isin(shop_id_map), 'shop_id']
#%%
# Derive two categorical features from the space-separated shop name:
#   shop_city - first token, with any '!' stripped from its ends
#   shop_type - second token when it is one of the known type abbreviations,
#               otherwise 'Others'
shop_types = ['ТЦ', 'ТРК', 'ТРЦ', 'ТК', 'МТРЦ']
name_tokens = shops['shop_name'].map(lambda name: name.split(' '))
shops['shop_city'] = name_tokens.map(lambda tokens: tokens[0].strip('!'))
shops['shop_type'] = name_tokens.map(
    lambda tokens: tokens[1] if tokens[1] in shop_types else 'Others'
)
# Shops 12 and 56 are online stores: override both features for them.
shops.loc[shops['shop_id'].isin([12, 56]), ['shop_city', 'shop_type']] = 'Online'
shops.head(13)
#%%
# Integer-encode the shop city / type labels in order of first appearance.
shop_city_map = {city: code for code, city in enumerate(shops['shop_city'].unique())}
shop_type_map = {kind: code for code, kind in enumerate(shops['shop_type'].unique())}
shops['shop_city_code'] = shops['shop_city'].map(shop_city_map)
shops['shop_type_code'] = shops['shop_type'].map(shop_type_map)
shops.head(7)
#%%
# Normalise item names by removing every space character, so that names which
# differ only in spacing compare equal.
items['item_name'] = items['item_name'].map(lambda name: name.replace(' ', ''))
# Rows whose normalised name already occurred earlier in the table ...
is_repeat = items['item_name'].duplicated()
duplicated_item_name = items[is_repeat]
# ... and every row (first occurrence included) that shares one of those names.
shares_repeated_name = items['item_name'].isin(duplicated_item_name['item_name'])
duplicated_item_name_rec = items[shares_repeated_name]
#%%
# Collapse items that share a normalised name onto a single item_id.
# For each group of rows with the same item_name, the canonical id is the last
# one listed; every other id in the group is remapped to it. Grouping by name
# (instead of pairing rows by even/odd position) stays correct when duplicates
# are not adjacent in the table or when a name occurs more than twice.
canonical_id = duplicated_item_name_rec.groupby('item_name')['item_id'].transform('last')
needs_remap = duplicated_item_name_rec['item_id'] != canonical_id
old_id = duplicated_item_name_rec.loc[needs_remap, 'item_id'].values
new_id = canonical_id[needs_remap].values
old_new_map = dict(zip(old_id, new_id))
stale_rows = train['item_id'].isin(old_id)
train.loc[stale_rows, 'item_id'] = train.loc[stale_rows, 'item_id'].map(old_new_map)
# Check: select sales rows that still reference a replaced id.
train[train['item_id'].isin(old_id)]
#%%
def _to_item_type(category_name):
    """Return the coarse item type derived from a category name."""
    # NOTE(review): `> 0` (rather than `>= 0`) means a name that *starts* with
    # 'Игры ' is handled by the split branch below - confirm this is intended.
    if category_name.find('Игры ') > 0:
        return 'Игры'
    # Otherwise keep the text before the first ' -' and strip double quotes.
    return category_name.split(' -')[0].strip('\"')


cat['item_type'] = cat['item_category_name'].map(_to_item_type)
# Manual overrides for three rows (positions 32, -3, -2), written to the last
# column - presumably the item_type column just appended above.
cat.iloc[[32, -3, -2], -1] = ['Карты оплаты', 'Чистые носители', 'Чистые носители']
# Integer-encode the item types in order of first appearance.
item_type_map = {label: code for code, label in enumerate(cat['item_type'].unique())}
cat['item_type_code'] = cat['item_type'].map(item_type_map)
cat.head()
#%%
# Sub-type: the text after the first '-' of the category name (the whole name
# when it contains no '-'), then integer-encoded in order of first appearance.
cat['sub_type'] = cat['item_category_name'].map(lambda name: name.split('-', 1)[-1])
sub_type_map = {label: code for code, label in enumerate(cat['sub_type'].unique())}
cat['sub_type_code'] = cat['sub_type'].map(sub_type_map)
#%%
# Attach the two category codes to every item via its category id.
category_codes = cat[['item_category_id', 'item_type_code', 'sub_type_code']]
items = items.merge(category_codes, on='item_category_id', how='left')
items.head()
#%%
import gc
# Drop the category table and trigger a garbage-collection pass to free memory.
del cat
gc.collect()
#%%
# Scatter plot (with marginal distributions) of daily count against price, used
# to spot outliers. x / y / data are passed by keyword: recent seaborn releases
# no longer accept them positionally.
sns.jointplot(x='item_cnt_day', y='item_price', data=train, kind='scatter')
#%%
# Sales rows with a daily count below 800 and a price below 70000.
within_loose_bounds = train['item_cnt_day'].lt(800) & train['item_price'].lt(70000)
train_filtered = train[within_loose_bounds].copy()
#%%
# Candidate outliers: daily count above 400 or price above 40000.
is_candidate_outlier = train['item_cnt_day'].gt(400) | train['item_price'].gt(40000)
outer = train[is_candidate_outlier]
#%%
# For every item that has at least one candidate-outlier row, plot all of its
# (loosely filtered) sales as count vs. price, one colour per item.
outlier_item_ids = outer['item_id'].unique()
outer_set = train_filtered[train_filtered['item_id'].isin(outlier_item_ids)].groupby('item_id')
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
# Concatenate two palettes so more items get a distinct colour than the
# default palette alone would allow.
colors = sns.color_palette() + sns.color_palette('bright')
i = 1
for name, group in outer_set:
    # Wrap the colour index so a large number of items cannot raise IndexError.
    ax.plot(group['item_cnt_day'], group['item_price'], marker='o', linestyle='',
            ms=12, label=name, c=colors[i % len(colors)])
    i += 1
ax.legend()
plt.show()
#%%
# Final outlier cut: keep rows with a daily count below 400 and a price below 45000.
in_bounds = (train['item_cnt_day'] < 400) & (train['item_price'] < 45000)
filtered = train.loc[in_bounds].copy()
filtered.head()
#%%
# Additionally remove every sale of items 7238 and 14173.
excluded_rows = filtered.index[filtered['item_id'].isin([7238, 14173])]
filtered.drop(index=excluded_rows, inplace=True)
#%%
# Only `filtered` is used from here on: drop the other sales frames and trigger
# a garbage-collection pass to free memory.
del train, train_filtered
gc.collect()
#%%
# NOTE(review): the next line is not valid Python - the text between
# `filtered['item_price']` and `11].fillna(0)` appears to have been lost
# (possibly everything between a `<` and a later `>` was stripped on export).
# The code that defines `martix` and `train_set`, both used below, is not
# present in this file and must be restored from the original notebook.
filtered.loc[filtered['item_price'] 11].fillna(0)
# Drop the intermediate `martix` object and trigger a garbage-collection pass.
del martix
gc.collect()
#%%
# Shrink train_set's memory footprint by narrowing column dtypes:
#   columns whose name contains 'code' -> int8
#   other float64 columns              -> float32
#   other int64 columns                -> int16
# NOTE(review): these casts do not check value ranges; confirm that every
# affected column fits into the narrower type.
for col in train_set.columns:
    if 'code' in col:
        train_set[col] = train_set[col].astype(np.int8)
    elif train_set[col].dtype == 'float64':
        train_set[col] = train_set[col].astype(np.float32)
    elif train_set[col].dtype == 'int64':
        train_set[col] = train_set[col].astype(np.int16)
# Mark the two item-category codes as categorical columns.
train_set['item_type_code'] = train_set['item_type_code'].astype('category')
train_set['sub_type_code'] = train_set['sub_type_code'].astype('category')
train_set.info()
#%%
import lightgbm as lgb
# Split by date_block_num: blocks below 32 train the model, block 32 is the
# validation set, block 33 is held out as the test set.
is_train = train_set['date_block_num'] < 32
is_validate = train_set['date_block_num'] == 32
is_test = train_set['date_block_num'] == 33

X_train = train_set.loc[is_train].drop(columns=['item_cnt_month'])  # training features
Y_train = train_set.loc[is_train, 'item_cnt_month']  # training labels
X_validate = train_set.loc[is_validate].drop(columns=['item_cnt_month'])  # validation features
Y_validate = train_set.loc[is_validate, 'item_cnt_month']  # validation labels
X_test = train_set.loc[is_test].drop(columns=['item_cnt_month'])  # test features
#%%
# Labels of the held-out block, kept to score the predictions later.
Y_true = train_set.loc[is_test, 'item_cnt_month']
#%%
X_test[0:50]
#%%
# The splits are separate objects now; drop the full table to free memory.
del train_set
gc.collect()
#%%
# Wrap the training and validation splits in LightGBM Dataset objects.
train_data = lgb.Dataset(X_train, label=Y_train)
validate_data = lgb.Dataset(X_validate, label=Y_validate)
#%%
import time
# Train the LightGBM regressor and report the elapsed wall-clock time.
ts = time.time()
# The boosting-round budget and the early-stopping patience are supplied
# through `params`.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
# effect when bagging_freq is also set to a non-zero value - confirm whether
# bagging was meant to be active here.
params = {
    "objective": "regression",
    "metric": "rmse",
    'n_estimators': 10000,
    'early_stopping_rounds': 50,
    "num_leaves": 200,
    "learning_rate": 0.01,
    "bagging_fraction": 0.9,
    "feature_fraction": 0.3,
    "bagging_seed": 0,
}
print('Start....', ts)
# Log the evaluation metrics every 1000 rounds via the log_evaluation callback;
# the `verbose_eval` keyword is no longer accepted by lgb.train in LightGBM 4.x.
lgb_model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, validate_data],
    callbacks=[lgb.log_evaluation(period=1000)],
)
print('End...', time.time() - ts)
#%%
# Bar chart of the 40 most important features of the trained model.
lgb.plot_importance(lgb_model, max_num_features=40, figsize=(12, 8))
# The title previously read "Featurertances", which looks like a garbled
# "Feature importances".
plt.title("Feature importances")
plt.show()
#%%
# Shapes of the held-out features and labels (row counts should agree).
X_test.shape
#%%
Y_true.shape
#%%
# Predict the held-out block and clamp the predictions to the range [0, 20].
Y_test = np.clip(lgb_model.predict(X_test), 0, 20)
#%%
# Root-mean-squared error of the clamped predictions against the true labels.
error = Y_test - Y_true
mean_squared_error = (error ** 2).mean()
rmse = mean_squared_error ** 0.5
rmse
#%%
# Preview the first 50 rows of the held-out features ...
X_test.head(50)
#%%
# ... the first 50 clamped predictions ...
Y_test[0:50]
#%%
# ... and the first 50 true labels, for a side-by-side comparison.
Y_true[0:50]
#%%
# NOTE (pasted question, not code): This is the source code.
# Also, what is causing the error in this section?