锅炉信息网 > 锅炉知识 > 锅炉学习

工业蒸汽量预测2

发布时间:

1.数据包引入import matplotlib.pyplot as pltnimport seaborn as snsnimport pandas as pdnimport numpy as npnfrom scipy impo

1.数据包引入

import matplotlib.pyplot as pltnimport seaborn as snsnimport pandas as pdnimport numpy as npnfrom scipy import statsnfrom sklearn.model_selection import train_test_splitnfrom sklearn.model_selection import GridSearchCV, RepeatedKFoldnfrom sklearn.model_selection import cross_val_score, cross_val_predict, KFoldnfrom sklearn.metrics import make_scorer, mean_squared_errornfrom sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNetnfrom sklearn.svm import LinearSVR, SVRnfrom sklearn.neighbors import KNeighborsRegressornfrom sklearn.ensemble import RandomForestRegressornfrom sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressornfrom xgboost import XGBRegressornfrom sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler

2.数据载入

pd.set_option("display.max_rows", None) # None时不限制数量npd.set_option("display.max_columns", None)npd.set_option("display.width", 1000)nplt.rcParams["axes.unicode_minus"] = Falsensns.set_style("white", {"font.sans-serif": ["simhei", "Arial"]}) # 解决中文不能显示问题nnndata_train = pd.read_csv("./zhengqi_train.txt", sep="t", encoding="utf-8")ndata_test = pd.read_csv("./zhengqi_test.txt", sep="t", encoding="utf-8")ndata_train["oringin"] = "train"ndata_test["oringin"] = "test"ndata_all = pd.concat([data_train, data_test], axis=0, ignore_index=True)n# print(data_all.head())n# print(data_all.tail())

3.数据探查

3.1训练集和测试集数据分布

for col in data_all.columns[0:-2]:n g = sns.kdeplot(data_all[col][(data_all["oringin"] == "train")], color="Red", shade=True)n g = sns.kdeplot(data_all[col][(data_all["oringin"] == "test")], color="Blue", shade=True, ax=g)n g.set_xlabel(col)n g.set_ylabel("Frequency")n g.legend(["train", "test"])n plt.show()

删除特征V5,V9,V11,V17,V22,V28

data_all.drop(["V5", "V9", "V11", "V17", "V22", "V28"], axis=1, inplace=True)

再看下数据分布情况:

data_train1 = data_all[data_all["oringin"] == "train"].drop("oringin", axis=1)nfcols = 2nfrows = len(data_train1.columns)nplt.figure(figsize=(5 * fcols, 4 * frows))ni = 0nfor col in data_train1.columns:n i += 1n ax = plt.subplot(frows, fcols, i)n sns.regplot(n x=col,n y="target",n data=data_train,n ax=ax,n scatter_kws={"marker": ".", "s": 3, "alpha": 0.3},n line_kws={"color": "k"},n )n plt.xlabel(col)n plt.ylabel("target")n i += 1n ax = plt.subplot(frows, fcols, i)n sns.distplot(data_train[col].dropna(), fit=stats.norm)n plt.xlabel(col)nplt.show()

# 找出相关程度nplt.figure(figsize=(20, 16)) # 指定绘图对象宽度和高度ncolnm = data_train1.columns.tolist() # 列表头nmcorr = data_train1[colnm].corr(method="spearman") # 相关系数矩阵,即给出了任意两个变量之间的相关系数nmask = np.zeros_like(mcorr, dtype=np.bool) # 构造与mcorr同维数矩阵 为bool型nmask[np.triu_indices_from(mask)] = True # 角分线右侧为Truencmap = sns.diverging_palette(220, 10, as_cmap=True) # 返回matplotlib colormap对象ng = sns.heatmap(n mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt="0.2f"n) # 热力图(看两两相似度)nplt.show()

将与target的相关性绝对值<0.1的列剔除:

# Threshold for removing correlated variablesnthreshold = 0.1n# Absolute value correlation matrixncorr_matrix = data_train1.corr().abs()ndrop_col = corr_matrix[corr_matrix["target"] < threshold].indexnprint(drop_col)ndata_all.drop(drop_col, axis=1, inplace=True)nprint(data_all.columns)

标准正态化:

# normalise numeric columnsncols_numeric = list(data_all.columns)ncols_numeric.remove("oringin")nnn# 计算标准分数ndef scale_minmax(col):n return (col - col.min()) / (col.max() - col.min())nnnscale_cols = [col for col in cols_numeric if col != "target"]ndata_all[scale_cols] = data_all[scale_cols].apply(scale_minmax, axis=0)nprint(data_all[scale_cols].describe())

Box-Cox变换:

# Check effect of Box-Cox transforms on distributions of continuous variablesnfcols = 6nfrows = len(cols_numeric) - 1nplt.figure(figsize=(4 * fcols, 4 * frows))ni = 0nfor var in cols_numeric:n if var != "target":n dat = data_all[[var, "target"]].dropna()n i += 1n plt.subplot(frows, fcols, i)n sns.distplot(dat[var], fit=stats.norm)n plt.title(var + " Original")n plt.xlabel("")nn i += 1n plt.subplot(frows, fcols, i)n _ = stats.probplot(dat[var], plot=plt)n plt.title("skew=" + "{:.4f}".format(stats.skew(dat[var])))n plt.xlabel("")n plt.ylabel("")nn i += 1n plt.subplot(frows, fcols, i)n plt.plot(dat[var], dat["target"], ".", alpha=0.5)n plt.title("corr=" + "{:.2f}".format(np.corrcoef(dat[var], dat["target"])[0][1]))nn i += 1n plt.subplot(frows, fcols, i)n trans_var, lambda_var = stats.boxcox(dat[var].dropna() + 1)n trans_var = scale_minmax(trans_var)n sns.distplot(trans_var, fit=stats.norm)n plt.title(var + " Tramsformed")n plt.xlabel("")nn i += 1n plt.subplot(frows, fcols, i)n _ = stats.probplot(trans_var, plot=plt)n plt.title("skew=" + "{:.4f}".format(stats.skew(trans_var)))n plt.xlabel("")n plt.ylabel("")nn i += 1n plt.subplot(frows, fcols, i)n plt.plot(trans_var, dat["target"], ".", alpha=0.5)n plt.title(n "corr=" + "{:.2f}".format(np.corrcoef(trans_var, dat["target"])[0][1])n )nplt.show()

变换:

for col in cols_transform:n data_all.loc[:, col], _ = stats.boxcox(data_all.loc[:, col] + 1)n"""print(data_all.target.describe())nplt.figure(figsize=(12, 4))nplt.subplot(1, 2, 1)nsns.distplot(data_all.target.dropna(), fit=stats.norm)nplt.subplot(1, 2, 2)n_ = stats.probplot(data_all.target.dropna(), plot=plt)nplt.show()"""

转换:

# Log Transform SalePrice to improve normalitynsp = data_train.targetndata_train.target1 = np.power(1.5, sp)nprint(data_train.target1.describe())

for column in data_all.columns[0:-2]:n g = sns.kdeplot(n data_all[column][(data_all["oringin"] == "train")], color="Red", shade=Truen )n g = sns.kdeplot(n data_all[column][(data_all["oringin"] == "test")],n ax=g,n color="Blue",n shade=True,n )n g.set_xlabel(column)n g.set_ylabel("Frequency")n g = g.legend(["train", "test"])n plt.show()

精选推荐

  • 催化燃烧设备供应商
    催化燃烧设备供应商

    催化燃烧设备供应商,催化燃烧处理装置厂家,本装置是采用废气先进入喷淋塔过滤——干式过滤—-蜂窝活性碳吸附—脱附再生——催化燃

  • 该不该有模具供应商
    该不该有模具供应商

    今天红姐又来跟我探讨供应商的管理问题了。故事是这样的:供应商来料不良,原因是模具问题。而那个模具是我们找的一家模具供应商做的

  • 什么牌子高压锅好,高压锅哪个牌子好,高压锅什么牌子好,高压锅哪个品牌好
    什么牌子高压锅好,高压锅哪个牌子好,高

    1苏泊尔 双重安全阀门 高压锅双重安全阀,防烫把手,复合底。这款高压锅拥有双重安全阀门,更好的保证使用安全。搭载防烫伤把手,方便起

  • 高压锅啥牌子好,高压锅哪个牌子的好,什么高压锅牌子好,高压锅推荐选购指南
    高压锅啥牌子好,高压锅哪个牌子的好,什

    1、双喜阶梯型复底高压锅推荐理由:高压锅满足上蒸下煮,饭菜同时进行,方便快速,有效提升烹饪效率。多重安全防护,安全系数较高,家人使用

0