Data Visualization
① Reading the data
It is usually worth adding column headers when reading the data, so you should roughly inspect the shape of the data file beforehand.
import pandas as pd
import matplotlib.pyplot as plt

path = 'ex2data1.txt'
data = pd.read_csv(path, header=None, names=['exam1', 'exam2', 'admitted'])
data.head()
data.describe()  # summary statistics for every column of the data
② Plotting
You can plot either with DataFrame.plot or directly with matplotlib; while doing the assignment I found the answers basically all use the latter.
My approach:
# split the rows into admitted and unadmitted
admitted = data.loc[data.values[:,2] == 1]
unadmitted = data.loc[data.values[:,2] == 0]
# plotted with DataFrame.plot; after seeing the answer I realised I had made this more complicated than necessary
ax = admitted.plot.scatter(x='exam1', y='exam2', color='b', marker='+',label='admitted')
unadmitted.plot.scatter(x='exam1', y='exam2',color='y', marker="o", label='unadmitted',ax=ax)
plt.xlabel("Exam 1 score")
plt.ylabel("Exam 2 score")
plt.show()
The answer:
positive = data[data['Admitted'].isin([1])]
negative = data[data['Admitted'].isin([0])]
fig, ax = plt.subplots(figsize=(12,8))  # after finishing linear and logistic regression, I found these steps are basically the generic plotting recipe
ax.scatter(positive['Exam 1'], positive['Exam 2'], s=50, c='b', marker='o', label='Admitted')
ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
plt.show()
Logistic Regression
① The logistic (sigmoid) function
$g(z) = \dfrac{1}{1 + e^{-z}}$
# Python code
import numpy as np
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
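A quick sanity check of my own (not part of the assignment answer): sigmoid(0) should be exactly 0.5, and the function should work element-wise on arrays:
print(sigmoid(0))                        # 0.5
print(sigmoid(np.array([-10, 0, 10])))   # roughly [0, 0.5, 1]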
② Cost function
$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\left[-y^{(i)}\log\left(h_\theta(x^{(i)})\right) - \left(1-y^{(i)}\right)\log\left(1-h_\theta(x^{(i)})\right)\right]$, where $h_\theta(x) = g(\theta^{T}x)$
# Python code
import numpy as np
def cost(theta, X, y):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
    return np.sum(first - second) / len(X)
③ Gradient descent
$\theta_j := \theta_j - \dfrac{\alpha}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)}$
# adapted from the answer to the first week's assignment, but I later found this assignment uses SciPy's truncated Newton (TNC) method to find the optimal parameters
def gradientDescent(X, y, theta, alpha, iters):
    theta = np.matrix(theta)
    # convert X and y to matrices as well, so the column slicing below keeps its (m, 1) shape
    X = np.matrix(X)
    y = np.matrix(y)
    temp = np.matrix(np.zeros(theta.shape))
    # parameters: how many theta values there are
    parameters = int(theta.ravel().shape[1])
    costs = np.zeros(iters)
    for i in range(iters):
        error = sigmoid(X * theta.T) - y
        for j in range(parameters):
            term = np.multiply(error, X[:,j])
            temp[0,j] = theta[0,j] - ((alpha / len(X)) * np.sum(term))
        theta = temp
        costs[i] = cost(theta, X, y)
    return theta, costs
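Running it looks like the linear-regression version. This is a hypothetical run: X, y and theta come from the data-preparation step in ④ below, and alpha / iters are values I picked for illustration, not taken from these notes:
alpha = 0.001
iters = 1000
g, costs = gradientDescent(X, y, theta, alpha, iters)
print(costs[0], costs[-1])   # the cost decreases only slowly, as noted below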
TNC:
import scipy.optimize as opt
result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X, y))
result
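The fprime=gradient argument refers to a function that returns only the gradient vector (the optimizer does the updating), which is not shown in these notes. A minimal sketch of what it looks like, assuming the same np.matrix convention as cost:
def gradient(theta, X, y):
    # gradient only: no update step and no loop over iterations
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    parameters = int(theta.ravel().shape[1])
    grad = np.zeros(parameters)
    error = sigmoid(X * theta.T) - y
    for j in range(parameters):
        term = np.multiply(error, X[:,j])
        grad[j] = np.sum(term) / len(X)
    return grad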
The theta obtained from plain gradient descent performs very poorly: the cost only drops from 0.69 to 0.60, and the final predictions are only about 60% accurate.
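The accuracy can be checked with a small predict helper. This is my own sketch (the 0.5 threshold and the comparison code are assumptions, not quoted from the answer); X and y are the arrays built in ④ below:
def predict(theta, X):
    # classify as 1 when the predicted probability is at least 0.5
    probability = sigmoid(X * theta.T)
    return [1 if p >= 0.5 else 0 for p in probability]

theta_min = np.matrix(result[0])   # optimal theta returned by fmin_tnc above
predictions = predict(theta_min, np.matrix(X))
correct = [1 if p == a else 0 for (p, a) in zip(predictions, y.ravel())]
print('accuracy = {0:.0%}'.format(sum(correct) / len(correct)))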
④ Data preparation
# before running gradient descent, add a column of ones to the data so the intercept term works in the matrix multiplication
data.insert(0, 'Ones', 1)
cols = data.shape[1]
X = data.iloc[:,0:cols-1]
y = data.iloc[:,cols-1:cols]
X = np.array(X.values)
y = np.array(y.values)
theta = np.zeros(3)
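A quick check of my own before optimizing: with theta all zeros, every prediction is sigmoid(0) = 0.5, so the cost should be -ln(0.5) ≈ 0.693, which matches the 0.69 starting value mentioned above:
print(X.shape, y.shape, theta.shape)   # expect (m, 3), (m, 1), (3,)
print(cost(theta, X, y))               # about 0.693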
⑤ Regularization
Cost function
$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\left[-y^{(i)}\log\left(h_\theta(x^{(i)})\right) - \left(1-y^{(i)}\right)\log\left(1-h_\theta(x^{(i)})\right)\right] + \dfrac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$
# same as the original cost function plus a penalty term; note that the parameter named learningRate below is actually the regularization parameter λ. Python code:
def costReg(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
    reg = (learningRate / (2 * len(X))) * np.sum(np.power(theta[:,1:theta.shape[1]], 2))
    return np.sum(first - second) / len(X) + reg
Gradient
$\dfrac{\partial J(\theta)}{\partial \theta_0} = \dfrac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_0^{(i)}, \qquad \dfrac{\partial J(\theta)}{\partial \theta_j} = \dfrac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)} + \dfrac{\lambda}{m}\theta_j \quad (j \ge 1)$
def gradientReg(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    parameters = int(theta.ravel().shape[1])
    grad = np.zeros(parameters)
    error = sigmoid(X * theta.T) - y
    for i in range(parameters):
        term = np.multiply(error, X[:,i])
        if (i == 0):
            grad[i] = np.sum(term) / len(X)
        else:
            grad[i] = (np.sum(term) / len(X)) + ((learningRate / len(X)) * theta[:,i])
    return grad
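As in the unregularized case, the answer hands costReg and gradientReg to SciPy's TNC optimizer rather than running hand-written gradient descent. A sketch under assumptions: X2, y2 and theta2 are names I made up for the feature matrix, labels and initial parameters built from data2 after the polynomial expansion shown next, and learningRate = 1 is simply the λ value I picked:
import scipy.optimize as opt
learningRate = 1
theta2 = np.zeros(X2.shape[1])   # X2 / y2 are hypothetical, built from data2 below
result2 = opt.fmin_tnc(func=costReg, x0=theta2, fprime=gradientReg, args=(X2, y2, learningRate))
result2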
Because the decision boundary for data2 is nonlinear, the answer adds polynomial feature terms: with degree = 5, the loop below generates 10 polynomial columns (terms of total degree up to 4) plus a column of ones.
degree = 5
x1 = data2['Test 1']
x2 = data2['Test 2']
data2.insert(3, 'Ones', 1)
for i in range(1, degree):
for j in range(0, i):
data2['F' + str(i) + str(j)] = np.power(x1, i-j) * np.power(x2, j)
data2.drop('Test 1', axis=1, inplace=True)
data2.drop('Test 2', axis=1, inplace=True)
data2.head()
Another version found online does the feature mapping differently and goes up to degree 6:
def mapFeature(x1, x2, degree):
    """
    take in numpy array of x1 and x2, return all polynomial terms up to the given degree
    """
    out = np.ones(len(x1)).reshape(len(x1), 1)
    for i in range(1, degree + 1):
        for j in range(i + 1):
            terms = (x1**(i-j) * x2**j).reshape(len(x1), 1)
            out = np.hstack((out, terms))
    return out
X = mapFeature(X[:,0], X[:,1],6)
The mapFeature function also adds a column of ones to X so we do not have to deal with it later on. Here, I decided to use np.hstack instead of np.append to add a new column to the numpy array. I found np.hstack to be much neater in the code compared to np.append that I normally used. Here, I allow degree as a parameter instead of fixing it to 6 like how it was done in the assignment; feel free to play around with different degrees and compare the results.
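A small sanity check of my own on the output size: for a given degree d, mapFeature returns 1 + 2 + ... + (d+1) = (d+1)(d+2)/2 columns including the bias column, so degree 6 gives 28 columns. The x1/x2 values below are arbitrary made-up samples:
x1 = np.array([0.05, -0.09, 0.18])
x2 = np.array([0.69, 0.68, 0.49])
print(mapFeature(x1, x2, 6).shape)   # (3, 28)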