支付宝红包
京东盲盒抽奖
幸运转盘
秒杀
自营热卖
支付宝红包

02. 对数几率回归

字句皆是你 1年前   阅读数 198 0
# 对数几率回归

# 对数几率回归回答是或否的问题

# sigmoid函数: 输入一个值,返回一个从0到1的值

# 对于分类问题使用的损失函数是交叉熵, 交叉熵能够输出一个更大的损失值,从而使梯度下降法做出更大的优化


import tensorflow as tf
import numpy as np
import pandas as pd

# Load the Titanic training set.
data = pd.read_csv('./data/titanic/train.csv')

# Show the available column names.
print(data.columns)

# Keep only the label column and the columns used to build features.
data = data[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

# Replace every missing value with 0.
data = data.fillna(0)

# Encode Sex with a fixed rule (female -> 1, anything else -> 0).
# pd.factorize numbers categories by order of first appearance, so its codes
# depend on row order and could differ between the training and test files.
# NOTE(review): this reproduces the old factorize codes whenever the first row
# is 'male' -- presumably true for the standard Kaggle file; confirm.
data['Sex'] = (data['Sex'] == 'female').astype(np.float32)

# Pclass takes the values 1, 2, 3; one-hot encode it into p1/p2/p3 so the model
# does not treat the class number as an ordered quantity.
for level in (1, 2, 3):
    data['p%d' % level] = (data['Pclass'] == level).astype(np.float32)
del data['Pclass']

# One-hot encode Embarked into e1/e2/e3 (S, C, Q). Rows whose Embarked was
# missing were filled with 0 above and therefore get all three flags at 0.
for column, port in (('e1', 'S'), ('e2', 'C'), ('e3', 'Q')):
    data[column] = (data['Embarked'] == port).astype(np.float32)
del data['Embarked']

# Feature matrix: one row per passenger, one column per feature, in this order.
feature_columns = ['Sex', 'Age', 'SibSp', 'Parch', 'Fare',
                   'p1', 'p2', 'p3', 'e1', 'e2', 'e3']
data_data = data[feature_columns].values.astype(np.float32)

# Labels as a column vector with one row per passenger. The row count is
# inferred (-1) instead of hard-coded so the script works for any file length.
data_target = data.Survived.values.astype(np.float32).reshape(-1, 1)

# Network definition: one linear layer followed by a sigmoid.

# Feature input: any number of rows per feed, always 11 columns.
x = tf.placeholder('float', shape=[None, 11])
# Label input: one value per row.
y = tf.placeholder('float', shape=[None, 1])

# Trainable parameters. The weight matrix has 11 rows to match the 11 input
# columns and a single column because the model emits one value per row.
weight = tf.Variable(tf.random_normal([11, 1]))
bias = tf.Variable(tf.random_normal([1]))

# Raw linear output (the logits).
output = tf.add(tf.matmul(x, weight), bias)

# Inference: the sigmoid turns the logit into a probability; above 0.5 is
# predicted as 1, otherwise 0. The boolean result is cast to float32.
probability = tf.sigmoid(output)
pred = tf.cast(tf.greater(probability, 0.5), tf.float32)

# Cross-entropy loss summed over the fed rows. The op takes the raw logits and
# applies the sigmoid itself, so `output` is passed rather than `probability`.
per_row_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=output)
loss = tf.reduce_sum(per_row_loss)

# Plain gradient descent with learning rate 0.001.
optimizer = tf.train.GradientDescentOptimizer(0.001)
train_step = optimizer.minimize(loss)

# Accuracy: mean of the element-wise match between predictions and labels.
is_correct = tf.equal(pred, y)
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

# Open a session and initialise the variables.
sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)

# Mini-batch training: 10000 epochs, each one a full shuffled pass over the data.
batch_size = 100
num_samples = len(data_target)

for i in range(10000):
    # Draw one permutation per epoch so that every row is visited exactly once
    # per epoch. The previous version reshuffled the whole dataset before each
    # batch and sliced [n:n+100] with n = 0..7, which produced overlapping
    # windows and never used the trailing partial batch.
    index = np.random.permutation(num_samples)

    for start in range(0, num_samples, batch_size):
        # Index through the permutation instead of reordering the arrays in
        # place; features and labels stay aligned because both use batch_index.
        batch_index = index[start:start + batch_size]
        batch_xs = data_data[batch_index]
        batch_ys = data_target[batch_index]

        sess.run(train_step, feed_dict={x: batch_xs, y: batch_ys})

    if i % 1000 == 0:
        # Progress report, evaluated on the last mini-batch of this epoch
        # (which may be shorter than batch_size).
        print(sess.run((loss, accuracy), feed_dict={x: batch_xs, y: batch_ys}))

# Load the test set and apply the same preprocessing as for the training data.
data_test = pd.read_csv('./data/titanic/test.csv')
data_test = data_test.fillna(0)

# Encode Sex with a fixed rule (female -> 1, anything else -> 0).
# pd.factorize numbers categories by order of first appearance, so running it
# separately on this file could yield codes that disagree with the training set.
# NOTE(review): this equals the factorize codes whenever the first row of each
# file is 'male' -- presumably true for the standard Kaggle files; confirm.
data_test['Sex'] = (data_test['Sex'] == 'female').astype(np.float32)

# One-hot encode Pclass (1, 2, 3) into p1/p2/p3.
for level in (1, 2, 3):
    data_test['p%d' % level] = (data_test['Pclass'] == level).astype(np.float32)

# One-hot encode Embarked (S, C, Q) into e1/e2/e3.
for column, port in (('e1', 'S'), ('e2', 'C'), ('e3', 'Q')):
    data_test[column] = (data_test['Embarked'] == port).astype(np.float32)

# Feature matrix with the same 11 columns, in the same order, as the training
# features fed to the placeholder `x`.
test_feature_columns = ['Sex', 'Age', 'SibSp', 'Parch', 'Fare',
                        'p1', 'p2', 'p3', 'e1', 'e2', 'e3']
test_data = data_test[test_feature_columns].values.astype(np.float32)

# Reference labels for the test rows.
# NOTE(review): gender_submission.csv is presumably Kaggle's sample submission
# rather than ground truth, in which case this measures agreement with that
# baseline, not true accuracy -- confirm.
test_lable = pd.read_csv('./data/titanic/gender_submission.csv')

# Column vector with one row per test passenger; the row count is inferred
# (-1) instead of hard-coded.
test_lable = test_lable.Survived.values.astype(np.float32).reshape(-1, 1)

# Print the result: when run as a script, a bare sess.run(...) expression
# computes the accuracy and then silently discards it.
print(sess.run(accuracy, feed_dict={x: test_data, y: test_lable}))

sess.close()


注意:本文归作者所有,未经作者允许,不得转载

全部评论: 0

    我有话说: