
ml_week2


Data Visualization

① Reading the data

You usually add column headers when reading the data, so it's a good idea to inspect the file's format beforehand.

import pandas as pd

path = 'ex2data1.txt'
data = pd.read_csv(path, header=None, names=['exam1', 'exam2', 'admitted'])
data.head()
data.describe()  # summary statistics for every column
② Plotting

You can plot with DataFrame.plot or directly with matplotlib; while doing the assignment I noticed the answers basically all use the latter.

My approach:

import matplotlib.pyplot as plt

# split the data into admitted and unadmitted
admitted = data.loc[data.values[:, 2] == 1]
unadmitted = data.loc[data.values[:, 2] == 0]

# plotted with DataFrame.plot; after seeing the answer I realized this was more work than necessary
ax = admitted.plot.scatter(x='exam1', y='exam2', color='b', marker='+', label='admitted')
unadmitted.plot.scatter(x='exam1', y='exam2', color='y', marker='o', label='unadmitted', ax=ax)

plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.show()

The answer:

# note: this version reads the data with column names ['Exam 1', 'Exam 2', 'Admitted']
positive = data[data['Admitted'].isin([1])]
negative = data[data['Admitted'].isin([0])]

fig, ax = plt.subplots(figsize=(12,8))  # after doing linear and logistic regression, these few steps turn out to be the standard plotting boilerplate
ax.scatter(positive['Exam 1'], positive['Exam 2'], s=50, c='b', marker='o', label='Admitted')
ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
plt.show()

Logistic Regression

① The sigmoid function
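The hypothesis maps any real input into the interval (0, 1) via the sigmoid function:

$$ g(z) = \frac{1}{1 + e^{-z}} $$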
# Python code
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))
② Cost function
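For m training examples, the cost to minimize is:

$$ J(\theta) = \frac{1}{m} \sum_{i=1}^{m} \Big[ -y^{(i)} \log\big(h_\theta(x^{(i)})\big) - \big(1 - y^{(i)}\big) \log\big(1 - h_\theta(x^{(i)})\big) \Big] $$

where $h_\theta(x) = g(\theta^T x)$ is the sigmoid applied to the linear combination.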
# Python code
import numpy as np

def cost(theta, X, y):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
    return np.sum(first - second) / len(X)
③ Gradient descent
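Each step moves every parameter against its partial derivative:

$$ \frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^{m} \big( h_\theta(x^{(i)}) - y^{(i)} \big) x_j^{(i)} $$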
# Adapted from the answer to the first week's assignment; I later noticed the assignment
# itself uses SciPy's truncated Newton method to find the optimal parameters
def gradientDescent(X, y, theta, alpha, iters):
    theta = np.matrix(theta)
    X = np.matrix(X)  # convert to matrices so * means matrix multiplication
    y = np.matrix(y)
    temp = np.matrix(np.zeros(theta.shape))
    # parameters is the number of parameters to fit
    parameters = int(theta.ravel().shape[1])
    costs = np.zeros(iters)

    for i in range(iters):
        error = sigmoid(X * theta.T) - y

        for j in range(parameters):
            term = np.multiply(error, X[:, j])
            temp[0, j] = theta[0, j] - ((alpha / len(X)) * np.sum(term))

        theta = temp
        costs[i] = cost(theta, X, y)

    return theta, costs
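A hypothetical run using the X, y, theta built in ④ below; the learning rate and iteration count are my own picks, not values from the assignment:

alpha = 0.001
iters = 10000
theta_gd, costs = gradientDescent(X, y, theta, alpha, iters)
print(costs[0], costs[-1])  # with theta initialized to zeros the cost starts near ln(2) ≈ 0.693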

TNC:

import scipy.optimize as opt
result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X, y))
result
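The call above passes fprime=gradient, but the post never defines that function. A minimal sketch, mirroring gradientReg below minus the penalty term; it only returns the gradient vector, without taking a descent step, which is exactly what fmin_tnc expects:

def gradient(theta, X, y):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)

    parameters = int(theta.ravel().shape[1])
    grad = np.zeros(parameters)

    error = sigmoid(X * theta.T) - y

    # one partial derivative per parameter, no step taken
    for i in range(parameters):
        term = np.multiply(error, X[:, i])
        grad[i] = np.sum(term) / len(X)

    return grad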

Running plain gradient descent gives a very poor theta: the cost only drops from 0.69 to 0.60, and the final predictions are only about 60% accurate.
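To measure that accuracy, a predict helper can threshold the sigmoid output at 0.5. A sketch following the course notebook's convention; theta_min and the surrounding names are illustrative:

def predict(theta, X):
    # classify as 1 when the predicted probability is at least 0.5
    probability = sigmoid(X * theta.T)
    return [1 if p >= 0.5 else 0 for p in probability]

theta_min = np.matrix(result[0])  # parameters found by fmin_tnc above
predictions = predict(theta_min, np.matrix(X))
correct = [1 if p == a else 0 for (p, a) in zip(predictions, y)]
accuracy = sum(correct) / len(correct)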

④ Data preparation
# before running gradient descent, add a column of ones so the intercept term works through matrix multiplication
data.insert(0, 'Ones', 1)


cols = data.shape[1]
X = data.iloc[:,0:cols-1]
y = data.iloc[:,cols-1:cols]


X = np.array(X.values)
y = np.array(y.values)
theta = np.zeros(3)
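A quick sanity check of the shapes (ex2data1.txt has 100 examples):

X.shape, y.shape, theta.shape  # ((100, 3), (100, 1), (3,))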
⑤ Regularization
Cost function
# one penalty term more than the original cost function; Python code
def costReg(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
    reg = (learningRate / (2 * len(X))) * np.sum(np.power(theta[:,1:theta.shape[1]], 2))
    return np.sum(first - second) / len(X) + reg
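In formula form, the penalty adds the squared magnitude of every parameter except $\theta_0$; note that despite its name, the learningRate argument here is really the regularization strength $\lambda$:

$$ J(\theta) = \frac{1}{m} \sum_{i=1}^{m} \Big[ -y^{(i)} \log\big(h_\theta(x^{(i)})\big) - \big(1 - y^{(i)}\big) \log\big(1 - h_\theta(x^{(i)})\big) \Big] + \frac{\lambda}{2m} \sum_{j=1}^{n} \theta_j^2 $$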
Gradient descent
def gradientReg(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)

    parameters = int(theta.ravel().shape[1])
    grad = np.zeros(parameters)

    error = sigmoid(X * theta.T) - y

    for i in range(parameters):
        term = np.multiply(error, X[:,i])

        if (i == 0):
            grad[i] = np.sum(term) / len(X)
        else:
            grad[i] = (np.sum(term) / len(X)) + ((learningRate / len(X)) * theta[:,i])

    return grad
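The regularized pair plugs into the same optimizer. X2, y2, theta2 are placeholder names for the arrays built from the expanded data2 (that step is not shown in the post):

learningRate = 1  # regularization strength, i.e. lambda
result2 = opt.fmin_tnc(func=costReg, x0=theta2, fprime=gradientReg, args=(X2, y2, learningRate))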

Because data2's decision boundary is nonlinear, the answer maps the two test scores to polynomial features: with degree = 5, the loops below generate 10 polynomial terms, which together with the ones column makes 11 feature columns.

degree = 5
x1 = data2['Test 1']
x2 = data2['Test 2']

data2.insert(3, 'Ones', 1)

for i in range(1, degree):
    for j in range(0, i):
        data2['F' + str(i) + str(j)] = np.power(x1, i-j) * np.power(x2, j)

data2.drop('Test 1', axis=1, inplace=True)
data2.drop('Test 2', axis=1, inplace=True)

data2.head()

Another version found online builds the features a different way, going up to degree 6:

def mapFeature(x1,x2,degree):
    """
    take in numpy array of x1 and x2, return all polynomial terms up to the given degree
    """
    out = np.ones(len(x1)).reshape(len(x1),1)
    for i in range(1,degree+1):
        for j in range(i+1):
            terms= (x1**(i-j) * x2**j).reshape(len(x1),1)
            out= np.hstack((out,terms))
    return out
X = mapFeature(X[:, 0], X[:, 1], 6)

The mapFeature function also adds a column of ones to X, so we do not have to deal with it later on. Here I decided to use np.hstack instead of np.append to add a new column to the numpy array; I found np.hstack much neater in the code than the np.append I normally used. I also allow degree as a parameter instead of fixing it to 6 as in the assignment, so feel free to play around with different degrees and compare the results.

