吴恩达机器学习编程练习1：线性回归（python）

— tags: “机器学习” —

一：返回一个5阶单位矩阵

import numpy as np 
def warmupExercise():
    """Build, print and return the 5x5 identity matrix.

    Returns:
        numpy.ndarray: ``np.eye(5)``. It is returned as well as printed so
        that callers can actually use the matrix.
    """
    E5 = np.eye(5)
    print('这是一个五阶单位矩阵')
    print(E5)
    return E5

warmupExercise()

二：线性回归

1.含有一个变量，大意是：假如你是一个饭店老板，要在其他城市拓展业务，现有数据在ex1data1.txt中，第一列是人口，第二列是收益
导包

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

将数据读取，进行展示

# Read the two-column training set; the column names are supplied here
# rather than taken from the file.
data = pd.read_csv(
    'ex1data1.txt',
    names=['Population', 'Profit'],
)
# Summary statistics (the result is only displayed in an interactive session).
data.describe()
# Scatter plot of profit against population.
data.plot(kind='scatter', x='Population', y='Profit')
plt.show()

data.describe()

定义损失函数：

#代价函数
    # * 在matrix类型中是矩阵乘法，multiply是对应元素相乘
    # * 在ndarray类型中，dot或 @ 是矩阵乘法，* 是对应元素相乘
def computeCost(X, y, theta):
    """Return the squared-error cost J(theta) = sum((X theta^T - y)^2) / (2m).

    Args:
        X: 2-D design matrix with one row per sample (m rows, n columns).
        y: 2-D column of targets, shape (m, 1).
        theta: 2-D row of parameters, shape (1, n).

    Returns:
        The scalar cost, where m is ``len(X)``.
    """
    # ``@`` is a matrix product for both np.matrix and 2-D ndarray, whereas
    # ``*`` is element-wise on ndarray and would not compute X theta^T.
    residual = (X @ theta.T) - y
    return np.sum(np.power(residual, 2)) / (2 * len(X))

为了能够直接进行矩阵相乘，在数据的最前面增加一列全为1的列（x0）

#增加x0
data.insert(0,'Ones',1)

将数据分割出来：第0、1列是变量X，第2列是y

# Number of columns in the frame; the last one is the target and everything
# before it is a feature.
cols = data.shape[1]
print(cols)
X = data.iloc[:, :cols - 1]
y = data.iloc[:, cols - 1:]

X.head()

y.head()

转化成matrix类型

# Convert the DataFrames to np.matrix and start theta as a (1, 2) row of zeros.
X, y = np.matrix(X.values), np.matrix(y.values)
theta = np.matrix([0, 0])

X.shape,y.shape,theta.shape

计算代价函数

computeCost(X,y,theta)

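设置梯度下降
公式（对所有 $j$ 同时更新）：

$$\theta_j := \theta_j - \alpha \frac{1}{m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_j^{(i)}$$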

def gradientDescent(X, y, theta, alpha, epoch):
    """Run batch gradient descent for a fixed number of iterations.

    Args:
        X: design matrix, one row per sample (m rows, n columns).
        y: target column, shape (m, 1).
        theta: initial parameters as a row, shape (1, n).
        alpha: learning rate.
        epoch: number of iterations to run.

    Returns:
        (theta, cost): the parameters after the last update, and an ndarray
        of length ``epoch`` holding the cost computed after each update.
    """
    cost = np.zeros(epoch)  # cost[i] is the cost after update number i + 1
    m = X.shape[0]  # number of training samples

    for i in range(epoch):
        # Vectorized simultaneous update of every parameter:
        #   theta := theta - (alpha / m) * (X theta^T - y)^T X
        theta = theta - (alpha / m) * (X @ theta.T - y).T @ X
        cost[i] = computeCost(X, y, theta)

    return theta, cost

设置学习率和迭代次数

alpha = 0.01
epoch = 1000

final_theta,cost = gradientDescent(X,y,theta,alpha,epoch)

计算最后的损失

computeCost(X ,y ,final_theta)

绘制线性模型以及数据，直观地看出它的拟合。

np.linspace()在指定的间隔内返回均匀间隔的数字。

# 100 evenly spaced population values spanning the observed range.
x = np.linspace(data['Population'].min(), data['Population'].max(), 100)
# Fitted line: intercept final_theta[0, 0] plus slope final_theta[0, 1] times x.
f = final_theta[0, 1] * x + final_theta[0, 0]

fig, ax = plt.subplots(figsize=(6, 4))
# Red prediction line drawn together with the training points.
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(data.Population, data['Profit'], label='Traing Data')
ax.set_title('Predicted Profit vs. Population Size')
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.legend(loc=2)
plt.show()

将cost绘制出来

# Learning curve: the cost recorded for each gradient-descent iteration.
fig, ax = plt.subplots(figsize=(8, 4))
iterations = np.arange(epoch)
ax.plot(iterations, cost, 'r')
ax.set_title('Error vs. Training Epoch')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
plt.show()

2.多个变量：ex1data2.txt第一列是房子大小，第二列是房子卧室数量，第三列是房子价格。目标是预测房价。

# Read the three-column housing data set; the column names are supplied here
# rather than taken from the file.
path = 'ex1data2.txt'
data2 = pd.read_csv(
    path,
    names=['Size', 'Bedrooms', 'Price'],
)
# Peek at the first rows (only displayed in an interactive session).
data2.head()

预处理步骤 - 特征归一化

data2 = (data2 - data2.mean())/data2.std()
data2.head()

# Prepend the intercept column x0 = 1.
data2.insert(0, 'Ones', 1)

# Every column except the last is a feature; the last column is the target.
cols = data2.shape[1]
X2 = data2.iloc[:, :cols - 1]
y2 = data2.iloc[:, cols - 1:]

# Switch to np.matrix and start from a (1, 3) row of zero parameters.
X2, y2 = np.matrix(X2.values), np.matrix(y2.values)
theta2 = np.matrix([0, 0, 0])

# Fit by gradient descent using the alpha and epoch already defined in the file.
g2, cost2 = gradientDescent(X2, y2, theta2, alpha, epoch)

# Final cost together with the fitted parameters.
computeCost(X2, y2, g2), g2

绘制代价函数

# Learning curve for the multi-variable fit: cost2 against iteration index.
fig, ax = plt.subplots(figsize=(12, 8))
iterations = np.arange(epoch)
ax.plot(iterations, cost2, 'r')
ax.set_title('Error vs. Training Epoch')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
plt.show()

利用sklearn自带的线性回归

from sklearn import linear_model

# X and y are np.matrix at this point. Recent scikit-learn releases reject
# np.matrix input with a TypeError, so hand the estimator plain ndarrays.
model = linear_model.LinearRegression()
model.fit(np.asarray(X), np.asarray(y))

# Column 1 of X is the population feature (column 0 is the constant ones).
x = np.asarray(X)[:, 1]
f = model.predict(np.asarray(X)).flatten()

# Fitted line drawn together with the training points.
fig,ax = plt.subplots(figsize=(8,5))
ax.plot(x, f , 'r',label='Prediction')
ax.scatter(data.Population,data.Profit,label='Traning Data')
ax.legend(loc=2)
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs. Population Size')
plt.show()

直接求解的方法

#正规方程
def normalEpn(X, y):
    """Solve linear least squares in closed form via the normal equation.

    Computes theta = pinv(X^T X) X^T y.

    Args:
        X: design matrix, shape (m, n).
        y: targets, shape (m, 1).

    Returns:
        theta as a column of shape (n, 1).
    """
    # pinv rather than inv: X^T X is singular when feature columns are
    # linearly dependent, and inv raises LinAlgError there. pinv still gives
    # a least-squares solution and agrees with inv when X^T X is invertible.
    theta = np.linalg.pinv(X.T @ X) @ X.T @ y
    return theta

final_theta2 = normalEpn(X,y)
final_theta