This article walks through four hands-on cases (house price prediction, Titanic survival prediction, stock prediction, and movie review sentiment analysis) to get you started quickly with the prediction side of deep learning.
House Price Prediction
Based on a Decision Tree Regressor (DecisionTreeRegressor)
The data file is here:
Link: https://pan.baidu.com/s/1mPr60cFUSc5m7pmF8Ju4vw extraction code: j2b0
# Predict Beijing housing prices with DecisionTreeRegressor
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer, r2_score
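The grid search below scores candidate models with a performance_metric() function that the original post defines before this excerpt; a minimal sketch is assumed here, using the coefficient of determination (R²), a common choice for regression tutorials of this kind:
# Assumed scoring function (defined earlier in the original post, not shown
# in this excerpt): R^2 between true and predicted values; 1.0 is a perfect fit
def performance_metric(y_true, y_predict):
    return r2_score(y_true, y_predict)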
# Define the helper functions
# Grid-search for the best model
def gridSearchVC_fit_model(X, y):
    # Cross-validation splitter:
    # n_splits=10: the number of re-shuffling and splitting iterations (the default)
    # test_size=0.2: 20% of the data is held out for testing, 80% for training
    # random_state: seed for the random number generator; fix it to any value if
    # you want a second call to ShuffleSplit() to produce the same splits as the
    # first (do not fix it in production)
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    # Create the decision tree regressor
    regressor = DecisionTreeRegressor(random_state=0)
    # Search max_depth over the values 1 to 9
    # Note: under Python 2 the list() call can be dropped
    params = {"max_depth": list(range(1, 10))}
    # Wrap the performance_metric() function defined above into a scorer
    scoring_fnc = make_scorer(score_func=performance_metric)
    # Create the grid-search object:
    # estimator: the regressor to tune, here the decision tree regressor
    # param_grid: the grid of parameters to search
    # scoring: the scoring function
    # cv (Cross-Validation): a cross-validation generator or an iterable
    grid = GridSearchCV(estimator=regressor, param_grid=params,
                        scoring=scoring_fnc, cv=cv)
    # Fit the grid search on the data to find the best model
    grid = grid.fit(X, y)
    # Return the best estimator found
    return grid.best_estimator_
# Predict house prices
def PredictHousingPrice(X, y, fitter):
    # Number of fitting rounds
    epochs = 10
    # Holds the predicted prices
    y_predict_test_price = None
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2, random_state=0)
    # Fit repeatedly
    for epoch_i in range(epochs):
        # Fit a model on the training data and get back the best estimator
        reg = fitter(X_train, y_train)
        # Predict on the test data
        predicted_price = reg.predict(X_test)
        y_predict_test_price = predicted_price
        print("Round {} done.".format(epoch_i + 1))
    return y_test, y_predict_test_price
# Plot the true house prices against the predicted ones
def plotVersusFigure(y_true_price, y_predict_price):
    # Create a 10x7-inch figure
    plt.figure(figsize=(10, 7))
    # Curve 1: the true prices
    X_show = np.rint(np.linspace(1,
                                 np.max(y_true_price),
                                 len(y_true_price))
                     ).astype(int)
    # plot() arguments:
    # 1: x-axis values, spanning up to the highest true price
    # 2: y-axis values, the true prices
    # 3: line style; "o" marks points with circles, "-" draws a solid line
    # 4: line color, cyan here
    plt.plot(X_show, y_true_price, 'o-', color='c')
    # Curve 2: the predicted prices, drawn on the same axes
    X_show_predicted = np.rint(np.linspace(1,
                                           np.max(y_predict_price),
                                           len(y_predict_price))
                               ).astype(int)
    # plot() arguments:
    # 1: x-axis values, spanning up to the highest predicted price
    # 2: y-axis values, the predicted prices
    # 3: line style; "o" marks points with circles, "-" draws a solid line
    # 4: line color, magenta here
    plt.plot(X_show_predicted, y_predict_price, 'o-', color='m')
    # Title
    plt.title('Housing Prices Prediction')
    # Legend
    plt.legend(loc='lower right', labels=["True Prices", "Predicted Prices"])
    # x-axis label
    plt.xlabel("House's Price Tendency By Array")
    # y-axis label
    plt.ylabel("House's Price")
    # Render the plot
    plt.show()
# Let's go!
# Predict using the Beijing housing data
# Load the dataset
df = pd.read_csv('bj_housing.csv')
df.describe()
bj_prices = df['Value']
bj_prices.head()
bj_features = df.drop('Value', axis=1)
bj_features.head()
y_true_bj_price, y_predict_bj_price = \
    PredictHousingPrice(bj_features, bj_prices, gridSearchVC_fit_model)
y_true_bj_price.reset_index().drop('index', axis=1).head()
pd.Series(y_predict_bj_price).head()
# Compare true and predicted Beijing house prices
plotVersusFigure(y_true_bj_price, y_predict_bj_price)
Based on Keras
# Predict Boston house prices with Keras
import tensorflow as tf
from tensorflow import keras
import numpy as np
# Load the Boston housing data
(train_data, train_labels), (test_data, test_labels) = \
    keras.datasets.boston_housing.load_data()
# Shuffle the training set
# np.random.random() returns the requested number of random floats in [0.0, 1.0)
# np.argsort() returns the indices that would sort the array
order = np.argsort(np.random.random(train_labels.shape))
train_data = train_data[order]
train_labels = train_labels[order]
# Normalize the data
# The features come in different ranges and scales, so subtract the mean and
# divide by the standard deviation, feature by feature.
# The model can converge without feature normalization, but that makes training
# harder and makes the resulting model dependent on the units of the inputs.
mean = train_data.mean(axis=0)
std = train_data.std(axis=0)
train_data = (train_data - mean) / std
test_data = (test_data - mean) / std
print("train_data.shape: {}, train_labels.shape: {}."
      .format(train_data.shape, train_labels.shape))
print("test_data.shape: {}, test_labels.shape: {}."
      .format(test_data.shape, test_labels.shape))
# Build the model
def build_model():
    model = keras.Sequential([
        keras.layers.Dense(64, activation=tf.nn.relu,
                           input_shape=(train_data.shape[1],)),
        keras.layers.Dense(64, activation=tf.nn.relu),
        keras.layers.Dense(1)
    ])
    optimizer = tf.train.RMSPropOptimizer(0.001)
    model.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mae'])
    return model
model = build_model()
# Show the model architecture
model.summary()
# A custom callback invoked at the end of every epoch
class PrintDot(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs):
        if epoch % 100 == 0: print('')
        print('.', end='')
EPOCHS = 500
# Train the model
history = model.fit(train_data, train_labels, epochs=EPOCHS,
                    validation_split=0.2, verbose=0,
                    callbacks=[PrintDot()])
import matplotlib.pyplot as plt
# Plot the training error history
def plot_history(history):
    plt.figure()
    plt.xlabel('Epoch')
    plt.ylabel('Mean Abs Error [1000$]')
    plt.plot(history.epoch, np.array(history.history['mean_absolute_error']),
             label='Train Loss')
    plt.plot(history.epoch, np.array(history.history['val_mean_absolute_error']),
             label='Val loss')
    plt.legend()
    plt.ylim([0, 5])
    plt.show()
plot_history(history)
# Evaluate the model
[loss, mae] = model.evaluate(test_data, test_labels, verbose=0)
# The labels are median home values in thousands of dollars, hence mae * 1000
print("Testing set Mean Abs Error: ${:7.2f}".format(mae * 1000))
# Predict with the model
test_predictions = model.predict(test_data).flatten()
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [1000$]')
plt.ylabel('Predictions [1000$]')
plt.axis('equal')
plt.xlim(plt.xlim())
plt.ylim(plt.ylim())
plt.plot([-100, 100], [-100, 100])
plt.show()
# Inspect the error between the predictions and the true values
error = test_predictions - test_labels
plt.hist(error, bins=50)
plt.xlabel("Prediction Error [1000$]")
plt.ylabel("Count")
plt.show()
# Reuse the plotVersusFigure() helper defined in the DecisionTreeRegressor example above
# Plot the true values against the predicted values
plotVersusFigure(test_labels, test_predictions)
Titanic Survival Prediction
The dataset provides 1,309 rows of Titanic passenger data, of which 891 rows are training data and 418 rows are test data, with 12 columns in total; one column records whether the passenger survived.
Below we predict passenger survival with sklearn (decision tree, logistic regression, gradient boosting, multilayer perceptron) and Keras (a DNN).
The data file is here:
Link: https://pan.baidu.com/s/1o_FUa_4VxmqXVBMBGh4rog extraction code: apzg
Based on Sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the data
features = pd.read_csv('titanic_dataset.csv')
y_train = features['Survived']
X_train = features.drop('Survived', axis=1)
# Preview the first 5 rows
X_train.head()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))
X_train.info()
# First, look at the distribution of Age in the dataset
sns.distplot(X_train['Age'].dropna(), hist=True, kde=True)
# Fill the NaN values in Age with the median
X_train['Age'].replace(np.nan, np.nanmedian(X_train['Age']), inplace=True)
sns.distplot(X_train['Age'], hist=True, kde=True)
# Cabin has too many missing values; dropping it from the DataFrame won't hurt the predictions
X_train.drop("Cabin", axis=1, inplace=True)
# Let's see where the passengers embarked
# S: Southampton, England
# C: Cherbourg-Octeville, France
# Q: Queenstown, Ireland
X_train.Embarked.value_counts()
# Embarkation counts
sns.countplot(x='Embarked', data=X_train)
X_train['Embarked'].replace(np.nan, 'S', inplace=True)
# Fare has one missing value; find it, then fill it with a median
X_train[np.isnan(X_train["Fare"])]
# Look up the fares of Pclass 3 tickets boarded at Southampton
pclass3_fares = X_train.query('Pclass == 3 & Embarked == "S"')['Fare']
# First fill the missing values with 0
pclass3_fares = pclass3_fares.replace(np.nan, 0)
# Then take the median
median_fare = np.median(pclass3_fares)
# Finally write the median into the missing cell
X_train.loc[X_train['PassengerId'] == 1044, 'Fare'] = median_fare
X_train['Sex'].replace(['male', 'female'], [1,0], inplace=True)
X_train.isnull().sum()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))
X_train = pd.get_dummies(X_train)
# Preview the first 5 rows after one-hot encoding
X_train.head()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X_train, y_train, test_size=0.2, random_state=42, shuffle=True)
print("train_X.shape={}, train_y.shape={}".format(train_X.shape, train_y.shape))
print("test_X.shape={}, test_y.shape={}".format(test_X.shape, test_y.shape))
# Decision tree model
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
# Build and evaluate a decision tree model
def createDecisionTreeClassifier():
    model = DecisionTreeClassifier()
    # Train the model
    model.fit(train_X, train_y)
    # Predict
    train_pred = model.predict(train_X)
    test_pred = model.predict(test_X)
    # Compute the accuracy
    train_accuracy = accuracy_score(train_y, train_pred)
    test_accuracy = accuracy_score(test_y, test_pred)
    print('The training accuracy is {}.'.format(train_accuracy))
    print('The test accuracy is {}.'.format(test_accuracy))
    # ROC curve and AUC
    y_score_dt = model.predict_proba(test_X)
    fpr_dt, tpr_dt, thresholds_dt = metrics.roc_curve(test_y, y_score_dt[:, 1])
    print('Decision Tree Classifier AUC is: {:.3f}'.format(metrics.roc_auc_score(test_y, y_score_dt[:, 1])))
    return fpr_dt, tpr_dt
fpr_dt, tpr_dt = createDecisionTreeClassifier()
# Logistic regression model
from sklearn.linear_model import LogisticRegression
def createLogisticRegressionModel():
    model = LogisticRegression()
    model.fit(train_X, train_y)
    print('Logistic Regression Accuracy for training data is: {:.3f}'.format(model.score(train_X, train_y)))
    print('Logistic Regression Accuracy for testing data is: {:.3f}'.format(model.score(test_X, test_y)))
    y_score_lr = model.decision_function(test_X)
    print('Logistic Regression AUC is: {:.3f}'.format(metrics.roc_auc_score(test_y, y_score_lr)))
    fpr_lr, tpr_lr, thresholds_lr = metrics.roc_curve(test_y, y_score_lr)
    return fpr_lr, tpr_lr
fpr_lr, tpr_lr = createLogisticRegressionModel()
# Gradient boosting model
from sklearn.ensemble import GradientBoostingClassifier
def createGradientBoostingClassifierModel():
    model = GradientBoostingClassifier(n_estimators=500)
    model.fit(train_X, train_y)
    # Predict
    train_pred = model.predict(train_X)
    test_pred = model.predict(test_X)
    print('Gradient Boosting Accuracy for training data is: {:.3f}'.format(accuracy_score(train_y, train_pred)))
    print('Gradient Boosting Accuracy for testing data is: {:.3f}'.format(accuracy_score(test_y, test_pred)))
    # ROC curve and AUC
    y_score_gb = model.predict_proba(test_X)
    fpr_gb, tpr_gb, thresholds_gb = metrics.roc_curve(test_y, y_score_gb[:, 1])
    print('Gradient Boosting Classifier AUC is: {:.3f}'.format(metrics.roc_auc_score(test_y, y_score_gb[:, 1])))
    return fpr_gb, tpr_gb
fpr_gb, tpr_gb = createGradientBoostingClassifierModel()
# Multilayer perceptron (MLP) model
from sklearn.neural_network import MLPClassifier
def createMLPClassifierModel():
    model = MLPClassifier(hidden_layer_sizes=128, batch_size=64, max_iter=1000, solver="adam")
    model.fit(train_X, train_y)
    # Predict
    train_pred = model.predict(train_X)
    test_pred = model.predict(test_X)
    print('Neural Network classifier Accuracy for training data is: {:.3f}'.format(accuracy_score(train_y, train_pred)))
    print('Neural Network classifier Accuracy for testing data is: {:.3f}'.format(accuracy_score(test_y, test_pred)))
    # ROC curve and AUC
    y_score_nn = model.predict_proba(test_X)
    fpr_nn, tpr_nn, thresholds_nn = metrics.roc_curve(test_y, y_score_nn[:, 1])
    print('Neural Network Classifier AUC is: {:.3f}'.format(metrics.roc_auc_score(test_y, y_score_nn[:, 1])))
    return fpr_nn, tpr_nn
fpr_nn, tpr_nn = createMLPClassifierModel()
# Plot the ROC curves of all four models
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111)
ax1 = ax.plot(fpr_dt, tpr_dt, c='c', lw=2, label="Decision Tree")
ax2 = ax.plot(fpr_lr, tpr_lr, c='y', lw=2, label="Logistic Regression")
ax3 = ax.plot(fpr_gb, tpr_gb, c='r', lw=2, label="Gradient Boosting")
ax4 = ax.plot(fpr_nn, tpr_nn, c='b', lw=2, label="Neural Network")
ax.grid()
lns = ax1 + ax2 + ax3 + ax4
labs = [l.get_label() for l in lns]
ax.legend(lns, labs, loc=0)
plt.show()
Based on Keras
# Predict with a Keras neural network
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras import utils as np_utils
# Reload the data and repeat the same preprocessing as in the sklearn section above
features = pd.read_csv('titanic_dataset.csv')
y_train = features['Survived']
X_train = features.drop('Survived', axis=1)
# Fill the missing Age values with the median
X_train['Age'].replace(np.nan, np.nanmedian(X_train['Age']), inplace=True)
# Drop Cabin, fill Embarked, and patch the one missing Fare value
X_train.drop("Cabin", axis=1, inplace=True)
X_train['Embarked'].replace(np.nan, 'S', inplace=True)
pclass3_fares = X_train.query('Pclass == 3 & Embarked == "S"')['Fare'].replace(np.nan, 0)
X_train.loc[X_train['PassengerId'] == 1044, 'Fare'] = np.median(pclass3_fares)
# Encode Sex, then one-hot encode the remaining categorical columns
X_train['Sex'].replace(['male', 'female'], [1, 0], inplace=True)
X_train = pd.get_dummies(X_train)
# Split into training and test sets
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X_train, y_train, test_size=0.2, random_state=42, shuffle=True)
print("train_X.shape={}, train_y.shape={}".format(train_X.shape, train_y.shape))
print("test_X.shape={}, test_y.shape={}".format(test_X.shape, test_y.shape))
def createKerasModel(X, y):
    # Build the model
    model = Sequential()
    # Initialize the kernels from a truncated normal distribution
    initializers = keras.initializers.TruncatedNormal(mean=0.0, stddev=0.05, seed=None)
    # The input dimension is X.shape[1]
    model.add(Dense(input_dim=X.shape[1], units=128, kernel_initializer=initializers, bias_initializer='zeros'))
    model.add(Activation("relu"))
    model.add(Dropout(0.2))
    model.add(Dense(32))
    model.add(Activation("relu"))
    model.add(Dense(2))
    # The output is either 1 or 0, so use a sigmoid activation
    model.add(Activation("sigmoid"))
    # Compile with binary cross-entropy loss; the adam optimizer tunes itself
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # One-hot encode the training labels
    y_train_categorical = np_utils.to_categorical(y)
    # Train for 150 epochs; verbose=1 logs each batch during training
    model.fit(X.values, y_train_categorical, epochs=150, verbose=1)
    return model
keras_model = createKerasModel(train_X, train_y)
y_test_categorical = np_utils.to_categorical(test_y)
loss_and_accuracy = keras_model.evaluate(test_X.values, y_test_categorical)
print("Loss={}, Accuracy={}.".format(loss_and_accuracy[0], loss_and_accuracy[1]))
predictions_classes = keras_model.predict_classes(test_X.values)
submission = pd.DataFrame({
    "PassengerId": test_X["PassengerId"],
    "Survived": predictions_classes})
print(submission[0:15])
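Note: predict_classes() is available on Sequential models only in older Keras releases and has been removed from newer ones; if you are on a newer version, the following sketch is equivalent:
# On newer Keras versions: take the argmax over the two per-class
# outputs produced by the Dense(2) + sigmoid head above
predictions_classes = np.argmax(keras_model.predict(test_X.values), axis=1)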
Stock Prediction
Using 3,000-plus rows of Baidu stock data, predict the stock price curve.
The data is fetched with the open-source quandl library, and the price prediction uses Facebook's open-source fbprophet library.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
!pip install quandl
import quandl
!pip install fbprophet
import fbprophet
def init_api_key():
    quandl.save_key("Your API Key")
    print(quandl.ApiConfig.api_key)
init_api_key()
quandl.read_key()
print(quandl.ApiConfig.api_key)
def init_stock(stock_name):
    # Fetch the stock data
    stock = quandl.get("WIKI/{}".format(stock_name))
    # Move the Date index out into the first column
    stock = stock.reset_index(level=0)
    return stock
# Fetch all the Baidu data
stock_name = "BIDU"
baiduStock = init_stock(stock_name)
baiduStock.head()
print("baiduStock has {} rows in total.".format(len(baiduStock)))
min_date = min(baiduStock['Date'])
max_date = max(baiduStock['Date'])
print("The Baidu stock data runs from {} to {}.".format(min_date, max_date))
print(type(baiduStock))
baiduStock.to_csv("baiduStock.csv", index=False)
baidu_df = pd.read_csv("baiduStock.csv")
baidu_df.head()
# Visualize the data
def plot_basic_stock_history(df, start_date, end_date, stock_name):
    stats_Adj_Close = 'Adj. Close'
    stat_min = min(df[stats_Adj_Close])
    stat_max = max(df[stats_Adj_Close])
    stat_mean = np.mean(df[stats_Adj_Close])
    date_stat_min = df[df[stats_Adj_Close] == stat_min]['Date']
    date_stat_min = date_stat_min[date_stat_min.index[0]].date()
    date_stat_max = df[df[stats_Adj_Close] == stat_max]['Date']
    date_stat_max = date_stat_max[date_stat_max.index[0]].date()
    print("{} bottomed on {} at {} USD.".format(stats_Adj_Close, date_stat_min, stat_min))
    print("{} peaked on {} at {} USD.".format(stats_Adj_Close, date_stat_max, stat_max))
    print("{} as of {} is {} USD.".format(stats_Adj_Close, end_date.date(), df.loc[df.index[-1], 'Adj. Close']))
    plt.style.use("default")
    plt.plot(df["Date"],
             df[stats_Adj_Close],
             color='r',
             linewidth=3,
             label=stats_Adj_Close)
    plt.xlabel("Date")
    plt.ylabel("US $")
    plt.title("{} Stock History".format(stock_name))
    plt.grid()
    plt.show()
start_date = min_date
end_date = max_date
plot_basic_stock_history(baiduStock, start_date, end_date, stock_name)
# Compute the profit from holding the shares
def plot_potential_profit(df,
                          start_date,
                          end_date,
                          stock_name,
                          line_color,
                          text_color,
                          myshares=1):
    start_price = float(df[df["Date"] == start_date]["Adj. Open"])
    end_price = float(df[df["Date"] == end_date]["Adj. Close"])
    df["profits"] = (df["Adj. Close"] - start_price) * myshares
    total_hold_profit = (end_price - start_price) * myshares
    print("Holding {} shares from {} to {} yields a total profit of {} USD.".format(
        myshares, start_date.date(), end_date.date(), total_hold_profit))
    plt.style.use("default")
    plt.plot(df["Date"], df["profits"], color=line_color, linewidth=3)
    plt.xlabel("Date")
    plt.ylabel("Profit $")
    plt.title("My Shares From {} to {} on {}.".format(start_date.date(), end_date.date(), stock_name))
    text_location_x = (end_date - pd.DateOffset(months=1)).date()
    text_location_y = total_hold_profit + (total_hold_profit / 40)
    plt.text(text_location_x,
             text_location_y,
             "${}".format(int(total_hold_profit)),
             color=text_color,
             size=15)
    plt.grid()
    plt.show()
start_date = min_date
end_date = max_date
plot_potential_profit(baiduStock, start_date, end_date, stock_name, 'm', 'g', 100)
# Had you held the stock from 2012 into 2013, you would have lost roughly half your money. But who could have known? It rallied that much afterwards
start_date = pd.to_datetime("2012-08-07")
end_date = pd.to_datetime("2013-03-05")
baiduStockLowerPricePhase = baiduStock[
(baiduStock['Date'] >= start_date.date()) &
(baiduStock['Date'] <= end_date.date())
]
plot_potential_profit(baiduStockLowerPricePhase, start_date, end_date, stock_name, 'c', 'r', 100)
# Train and evaluate the model
def train_model(stock_history, days=0, weekly_seasonality=False, monthly_seasonality=False):
    model = fbprophet.Prophet(daily_seasonality=False,
                              weekly_seasonality=weekly_seasonality,
                              yearly_seasonality=True,
                              changepoint_prior_scale=0.05)
    if monthly_seasonality:
        model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
    model.fit(stock_history)
    future = model.make_future_dataframe(periods=days)
    future = model.predict(future)
    return model, future
def create_prophet_model(df,
                         stock_name,
                         days=0,
                         weekly_seasonality=False,
                         monthly_seasonality=False):
    # Use only the most recent 3 years of history
    stock_history = df[df["Date"] > (max_date - pd.DateOffset(years=3)).date()]
    model, future = train_model(stock_history, days, weekly_seasonality, monthly_seasonality)
    plt.style.use("default")
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(10, 5)
    # Plot the observed values
    ax.plot(stock_history['ds'],
            stock_history['y'],
            'v-',
            linewidth=1.0,
            alpha=0.8,
            ms=1.8,
            label='Observations')
    # Plot the predicted values
    ax.plot(future['ds'],
            future['yhat'],
            'o-',
            linewidth=1.,
            label='Modeled')
    # Shade the uncertainty interval as a band
    ax.fill_between(future['ds'].dt.to_pydatetime(),
                    future['yhat_upper'],
                    future['yhat_lower'],
                    alpha=0.3,
                    facecolor='g',
                    edgecolor='k',
                    linewidth=1.0,
                    label='Confidence Interval')
    plt.legend(loc=2, prop={'size': 10})
    plt.title("{} Historical and Modeled Stock Price".format(stock_name))
    plt.xlabel('Date')
    plt.ylabel('Price $')
    plt.grid(linewidth=0.6, alpha=0.6)
    plt.show()
    return model, future
baiduStock["ds"] = baiduStock['Date']
baiduStock["y"] = baiduStock['Adj. Close']
model, future_data = create_prophet_model(baiduStock, stock_name, monthly_seasonality=True)
model.plot_components(future_data)
plt.show()
model, future_data = create_prophet_model(baiduStock, stock_name, weekly_seasonality=True, monthly_seasonality=True)
model.plot_components(future_data)
plt.show()
# Stock forecast: predict the next 180 days of the Baidu stock price from the time series
model, future = create_prophet_model(baiduStock, stock_name, days=180)
# Stock buying strategy
# prophet_evaluator is a helper module that ships with the original post's code
import prophet_evaluator
baiduStock["ds"] = baiduStock['Date']
baiduStock["y"] = baiduStock['Adj. Close']
prophet_evaluator.evaluator(baiduStock, min_date, max_date, train_model, stock_name, 1000)
Movie Review Sentiment Analysis
Sentiment analysis is a hard problem in natural language processing (NLP): sentiment can be subjective or objective, and the appropriate emotional reaction depends on the context and on the person or thing involved. This section shows how to analyze sentiment-labeled text data and predict whether the writer's emotional state at the time was positive or negative.
Everyday life offers plenty of examples. After shopping on e-commerce platforms such as JD.com or Taobao, users are asked to photograph, like, comment on, and star-rate the goods they received. The platform collects this data and runs sentiment analysis on it, learning buyers' preferences and satisfaction in order to improve its products and services; it also yields data on which products potential users are likely to buy.
Below we build the model as a recurrent neural network (RNN), using long short-term memory (LSTM) cells and an embedding layer; the final output layer uses a sigmoid activation, because the prediction is either positive or negative.
The data file is here:
Link: https://pan.baidu.com/s/1DQdAROwzOT6nXdWBYeT2bw password: 1rn7
Based on TensorFlow
import numpy as np
import tensorflow as tf
# Define the data-loading function
def loadData():
    # Load the reviews (one long string)
    with open('reviews.txt', 'r') as f:
        reviews = f.read()
    # Load the matching labels, positive or negative
    with open('labels.txt', 'r') as f:
        labels = f.read()
    # Return the reviews and labels
    return reviews, labels
# Call the function
reviews, labels = loadData()
# Look at the first 150 characters of the reviews
reviews[:150]
# Look at the first 150 characters of the labels
labels[:150]
from string import punctuation
# Define the data preprocessing function
def dataPreprocess(reviews_str):
    # Drop every punctuation character from reviews_str via a list comprehension,
    # then join() the remaining characters back into one long string
    all_text = ''.join(
        [c for c in reviews_str if c not in punctuation])
    # Split that string into a list of reviews on the newline character
    review_list = all_text.split('\n')
    # Join the reviews with spaces into one long string
    all_text = ' '.join(review_list)
    # split() with its default whitespace separator gives a list of words
    words = all_text.split()
    return review_list, all_text, words
# Call the function
reviews, all_text, words = dataPreprocess(reviews)
reviews[:2]
# Look at the first 20 elements (words)
words[:20]
# Look at the first 150 characters
all_text[:150]
# Encode the words
from collections import Counter
# Count how often each word occurs
word_counter = Counter(words)
# Sort word_counter by count in descending order, using sorted() with reverse=True
sorted_vocab = sorted(word_counter, key=word_counter.get, reverse=True)
# Show the first 10 items of a dict
def showTop10Item(dict_obj):
    word_index = 0
    for k, v in dict_obj.items():
        if word_index >= 10:
            break
        print("{}:{}".format(k, v))
        word_index += 1
# Show the words in word_counter with their counts
showTop10Item(word_counter)
# The 15 most common words with their counts, sorted from most to least frequent
word_counter.most_common(15)
# The first 15 words after sorting; same result as above
sorted_vocab[:15]
# Build the word-to-index dictionary; indices start at 1 (0 is reserved for padding)
vocab_to_int = {word: i for i, word in enumerate(sorted_vocab, 1)}
# Show the first 10 words with their indices
showTop10Item(vocab_to_int)
# Replace every word with its index and collect the results in reviews_ints:
# each review is now a list of integer indices instead of a string of words
reviews_ints = []
for review in reviews:
    reviews_ints.append([vocab_to_int[word] for word in review.split()])
print(reviews_ints[:1])
len(reviews_ints)
# Encode the labels
# positive becomes 1, negative becomes 0
labels = labels.split('\n')
labels = np.array([1 if label == 'positive' else 0 for label in labels])
# Look at the first 10 encoded labels
labels[:10]
from collections import Counter
review_lens = Counter([len(x) for x in reviews_ints])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))
# Filter out the zero-length reviews, keeping the indices of the non-empty ones
non_zero_idx = [i for i, review in enumerate(reviews_ints) if len(review) != 0]
# How many reviews remain after dropping the zero-length ones
print(len(non_zero_idx))
# Keep only the non-empty reviews, indexed by non_zero_idx
reviews_ints = [reviews_ints[i] for i in non_zero_idx]
# Drop the labels of the zero-length reviews as well
labels = np.array([labels[i] for i in non_zero_idx])
# Now build a features array from reviews_ints to serve as the feature vectors
# fed into the network. The network takes integer values, and each row may hold
# at most 200 of them: reviews shorter than 200 words are left-padded with
# zeros, and reviews longer than 200 words are truncated to their first 200.
# Maximum review length
seq_len = 200
# Create a matrix initialized to all zeros
features = np.zeros((len(reviews_ints), seq_len), dtype=int)
# Truncate each review in reviews_ints to 200 entries and copy it into features.
# Reviews shorter than 200 keep their own length
for i, row in enumerate(reviews_ints):
    # Reviews shorter than 200 are left-padded with zeros
    features[i, -len(row):] = np.array(row)[:seq_len]
# Look at the first row
features[0:1]
features.shape
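To make the left-padding concrete, here is a minimal illustration (with a hypothetical seq_len of 5) of what the loop above produces:
# A hypothetical 3-word review, already index-encoded, padded to length 5
demo = np.zeros((1, 5), dtype=int)
row = [11, 7, 42]
demo[0, -len(row):] = np.array(row)[:5]
print(demo)  # [[ 0  0 11  7 42]]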
# Split the data into training, validation, and test sets
# Use 80% of the data for training
split_train_ratio = 0.8
# Number of feature vectors
features_len = len(features)
# Number of training examples
train_len = int(features_len * split_train_ratio)
# Split off the training and validation data
train_x, val_x = features[:train_len], features[train_len:]
train_y, val_y = labels[:train_len], labels[train_len:]
# Split the remaining 20% in half
val_x_half_len = int(len(val_x) / 2)
# One half becomes the validation set, the other half the test set
val_x, test_x = val_x[:val_x_half_len], val_x[val_x_half_len:]
val_y, test_y = val_y[:val_x_half_len], val_y[val_x_half_len:]
# Print the shapes
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))
# Hyperparameters
lstm_size = 256
lstm_layers = 2
batch_size = 512
learning_rate = 0.01
# Total vocabulary size (add 1 for the padding value 0)
n_words = len(vocab_to_int) + 1
# Reset the default graph
tf.reset_default_graph()
# Prefix the input placeholders on the graph with "inputs"
with tf.name_scope('inputs'):
    # Placeholder for the input features
    inputs_ = tf.placeholder(tf.int32, [None, None], name="inputs")
    # Placeholder for the input labels
    labels_ = tf.placeholder(tf.int32, [None, None], name="labels")
    # Placeholder for the dropout keep probability
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")
# Size of the embedding vectors
embed_size = 300
# Prefix the embedding variable and lookup table with "Embeddings"
with tf.name_scope("Embeddings"):
    # Initialize the embedding variable uniformly between -1 and 1
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    # Look up the embeddings for the input features
    embed = tf.nn.embedding_lookup(embedding, inputs_)
def lstm_cell():
    # Create a basic LSTM cell
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size, reuse=tf.get_variable_scope().reuse)
    # Wrap the cell with dropout
    return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
# Prefix the RNN layers on the graph with "RNN_layers"
with tf.name_scope("RNN_layers"):
    # Stack multiple LSTM layers
    cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])
    # Get an initial state, all zeros by default
    initial_state = cell.zero_state(batch_size, tf.float32)
with tf.name_scope("RNN_forward"):
    # dynamic_rnn returns the per-step outputs and the final hidden state
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)
with tf.name_scope('predictions'):
    # Output layer; the prediction is 1 or 0, so sigmoid is the natural choice
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
with tf.name_scope('cost'):
    # Mean squared error training loss
    cost = tf.losses.mean_squared_error(labels_, predictions)
with tf.name_scope('train'):
    # Training optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
with tf.name_scope('validation'):
    # Validation accuracy
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# Generator function that yields batches of data
def get_batches(x, y, batch_size=100):
    # Integer division gives the number of full batches; if len(x) is not a
    # multiple of batch_size, the small remainder is dropped
    n_batches = len(x) // batch_size
    # Trim x and y down to full batches
    x, y = x[:n_batches * batch_size], y[:n_batches * batch_size]
    # Use yield inside a for loop to build the generator
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii + batch_size], y[ii:ii + batch_size]
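A quick way to see the batching behavior (including the dropped remainder) with toy arrays:
# 10 samples with batch_size=4 yield two full batches; the last 2 samples are dropped
xs, ys = np.arange(10), np.arange(10)
for bx, by in get_batches(xs, ys, batch_size=4):
    print(bx, by)
# [0 1 2 3] [0 1 2 3]
# [4 5 6 7] [4 5 6 7]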
# Number of epochs: 8
epochs = 8
# Checkpoint saver
saver = tf.train.Saver()
# Create a TensorFlow session
with tf.Session() as sess:
    # Initialize the global variables
    sess.run(tf.global_variables_initializer())
    iteration = 1
    # Start iterating
    for e in range(epochs):
        # Compute the initial state first
        state = sess.run(initial_state)
        # Train on all the data; get_batches() returns a generator to iterate over
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            # Log the training loss every 5 iterations
            if iteration % 5 == 0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))
            # Log the validation accuracy every 25 iterations
            if iteration % 25 == 0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                # Score every batch of the validation set
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = \
                        sess.run([accuracy, final_state], feed_dict=feed)
                    # Collect each batch's accuracy in val_acc
                    val_acc.append(batch_acc)
                # Print the mean validation accuracy
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration += 1
        # Save a checkpoint after every epoch
        saver.save(sess, "checkpoints/sentiment.ckpt")
    # Save a final checkpoint once all the training data has been iterated
    saver.save(sess, "checkpoints/sentiment.ckpt")
test_acc = []
with tf.Session() as sess:
    # Restore the trained model from the checkpoint
    saver.restore(sess, "checkpoints/sentiment.ckpt")
    # Start from a fresh all-zero state before scoring the test set
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    # Iterate over the test set batch generator
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        # Score this batch of test data
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        # Collect each batch's score
        test_acc.append(batch_acc)
    # Print the mean test score, i.e. the accuracy
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))
Based on Keras
# Based on Keras, using the built-in IMDB dataset
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# Fix a random seed for reproducibility
numpy.random.seed(7)
# Keep only the 5000 most common words; everything else maps to 0
top_words = 5000
# Load the dataset
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# Cap each review at 500 words
review_max_length = 500
# Reviews shorter than 500 are zero-padded; longer ones are truncated
X_train = sequence.pad_sequences(X_train, maxlen=review_max_length)
X_test = sequence.pad_sequences(X_test, maxlen=review_max_length)
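As a quick sanity check, pad_sequences() left-pads with zeros by default, mirroring the manual padding loop in the TensorFlow section:
print(sequence.pad_sequences([[11, 7, 42]], maxlen=5))  # [[ 0  0 11  7 42]]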
# Build the model
embedding_vector_length = 32
model = Sequential()
# Input embedding layer
model.add(Embedding(top_words, embedding_vector_length, input_length=review_max_length))
# LSTM hidden layer
model.add(LSTM(100))
# Output (fully connected) layer; binary classification, so sigmoid activation
model.add(Dense(1, activation='sigmoid'))
# Compile the model; binary classification, so binary cross-entropy loss
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Show the model architecture
model.summary()
# Train for 3 epochs over the whole training set with a batch size of 64
model.fit(X_train, y_train, epochs=3, batch_size=64)
# Finally, evaluate the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: {}".format(scores[1] * 100))