import os
import sys
sys.path.insert(0, os.path.abspath('..'))

# 선형대수 모듈로부터 Vector 자료형과 dot 함수 불러오기
from scratch.linear_algebra import Vector, dot


def sum_of_squares(v: Vector) -> float:
    """
    Return the sum of the squares of the elements of v.

    Computed as the dot product of v with itself.
    """
    squared_total = dot(v, v)
    return squared_total


def square(x: float) -> float:
    """Return x multiplied by itself."""
    squared = x * x
    return squared


def derivative(x: float) -> float:
    """Return 2 * x, the exact derivative of the square function at x."""
    slope = 2 * x
    return slope


from typing import Callable

def difference_quotient(f: Callable[[float], float],
                        x: float,
                        h: float) -> float:
    """
    Approximate the derivative of f at x with a forward difference.

    f: function to differentiate
    x: point at which the derivative is estimated
    h: amount by which x is shifted
    """
    shifted_value = f(x + h)
    base_value = f(x)
    return (shifted_value - base_value) / h


import matplotlib.pyplot as plt

# Step size used for the difference-quotient estimates
h = 0.001
# Integer sample points from -10 to 10 inclusive
xs = range(-10, 11)

# Exact derivative values and their difference-quotient estimates at each x
actuals = [derivative(x) for x in xs]
estimates = [difference_quotient(square, x, h) for x in xs]

plt.title("Actual Derivatives vs. Estimates")
# Actual derivative values (red dots)
plt.plot(xs, actuals, 'r.', label='Actual') 
# Estimated values (black + markers)
plt.plot(xs, estimates, 'k+', label='Estimates')
plt.legend()
plt.show()


def partial_difference_quotient(f: Callable[[Vector], float],
                                v: Vector,
                                i: int,
                                h: float) -> float:
    """
    Approximate the i-th partial derivative of f at v.

    f: function to differentiate
    v: argument vector
    i: index of the component that is varied
    h: amount by which v[i] is shifted
    """
    # Copy of v in which only the i-th component is increased by h
    shifted = []
    for j, v_j in enumerate(v):
        bump = h if j == i else 0
        shifted.append(v_j + bump)

    return (f(shifted) - f(v)) / h


def estimate_gradient(f: Callable[[Vector], float],
                      v: Vector,
                      h: float = 0.0001):
    """
    Estimate the gradient of f at v.

    Returns a list holding one partial difference quotient per component of v,
    each computed with step h.
    """
    partials = []
    for index in range(len(v)):
        partials.append(partial_difference_quotient(f, v, index, h))
    return partials


def sum_of_squares_gradient(v: Vector) -> Vector:
    """Return a new list with every element of v doubled (the gradient of the sum of squares)."""
    doubled = []
    for component in v:
        doubled.append(2 * component)
    return doubled


import random
from scratch.linear_algebra import distance, add, scalar_multiply

def gradient_step(v: Vector, gradient: Vector, step_size: float) -> Vector:
    """
    Return the new vector reached by moving from v along gradient.

    The move is step_size times gradient, so the sign and magnitude of
    step_size set the direction and length of the step.
    """
    return add(v, scalar_multiply(step_size, gradient))


# Pick a random starting point (the seed is fixed before drawing)
random.seed(42)
v = [random.uniform(-10, 10) for i in range(3)]

# Repeat gradient_step 1000 times
for epoch in range(1000):
    grad = sum_of_squares_gradient(v)
    # Negative step size: move v against the gradient
    v = gradient_step(v, grad, -0.01)
    # Print progress once every 100 epochs
    if epoch%100 == 0:
        print(epoch, v)

print("\n----\n")        
# The printed labels are Korean: "final value of the gradient" and
# "distance between the final position of v and the minimum point"
print(f"그레이디언트의 최종 값: {grad}")
print(f"v의 최후 위치와 최솟점 사이의 거리: {distance(v, [0, 0, 0])}")

0 [2.732765249774521, -9.309789197635727, -4.409425359965263]
100 [0.3624181137897112, -1.2346601088642208, -0.5847760329896551]
200 [0.048063729299005625, -0.16374007531854073, -0.07755273779298358]
300 [0.006374190434279768, -0.02171513607091833, -0.010285009644527733]
400 [0.0008453423045827667, -0.002879851701919325, -0.0013639934114305214]
500 [0.00011210892101281368, -0.00038192465375128993, -0.00018089220046728499]
600 [1.4867835316559317e-05, -5.065067796575345e-05, -2.3989843290795995e-05]
700 [1.971765716798422e-06, -6.717270417586384e-06, -3.1815223632100903e-06]
800 [2.6149469369030636e-07, -8.908414196052704e-07, -4.2193208287814765e-07]
900 [3.467931014604295e-08, -1.1814299344070243e-07, -5.5956445449048146e-08]

----

그레이디언트의 최종 값: [9.57758165411209e-09, -3.262822016281278e-08, -1.5453808714914104e-08]
v의 최후 위치와 최솟점 사이의 거리: 1.830234305038648e-08


# x values from -0.5 to 0.49 in steps of 0.01 (100 points)
xs = [x/100 for x in range(-50, 50)]

# One noise term per x value.
# NOTE: randrange(-100, 100)/100 draws uniformly from -1.00 .. 0.99 in steps
# of 0.01, so this is uniform noise, not Gaussian noise.
error = [random.randrange(-100,100)/100 for _ in range(-50, 50)]

# y = 20*x + 5 + noise
ys = [20*x + 5 + e for x, e in zip(xs, error)]

# List of (x, y) coordinate pairs
inputs = list(zip(xs, ys))


# Plot the noisy data as red dots
plt.plot(xs, ys, 'r.')
plt.show()


def linear_gradient(x: float, y: float, theta: Vector) -> Vector:
    """
    Return the gradient of the squared error at a single data point (x, y).

    x: input value
    y: observed target value
    theta: [slope, intercept] of the line used for the prediction
    Returns: [2 * error * x, 2 * error], i.e. the partial derivatives of
             (slope * x + intercept - y) ** 2 with respect to slope and
             intercept.
    """
    slope, intercept = theta
    # Signed residual: prediction minus observed value
    error = (slope * x + intercept) - y
    # The squared error itself is not needed for the gradient. Computing
    # error ** 2 here would raise OverflowError for very large float residuals
    # even though the gradient below is still finite.
    return [2 * error * x, 2 * error]


from scratch.linear_algebra import vector_mean

# Start from a random slope and intercept
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

learning_rate = 0.001

for epoch in range(5000):
    # Combine the per-point squared-error gradients over the whole training
    # set (vector_mean presumably averages them component-wise)
    grad = vector_mean([linear_gradient(x, y, theta) for x, y in inputs])
    # Update theta: step against the gradient, scaled by the learning rate
    theta = gradient_step(theta, grad, -learning_rate)
    # Print progress once every 500 epochs
    if epoch % 500 == 0:
        print(epoch, theta)

slope, intercept = theta

# The printed labels are Korean for "final slope" and "final intercept"
print(f"최종 기울기: {slope:.3f}")
print(f"최종 절편: {intercept:.3f}")

0 [-0.5573631781037754, -0.3409606698035382]
500 [1.0860605634068832, 2.9677764519232985]
1000 [2.6080336786406573, 4.18874863946214]
1500 [4.011976376077862, 4.642064339793808]
2000 [5.305013852497387, 4.812897015495151]
2500 [6.495163711496766, 4.879577951303121]
3000 [7.5903398484398314, 4.90767120517886]
3500 [8.598020889846392, 4.921296576290668]
4000 [9.525160097963871, 4.929341055389218]
4500 [10.378181475323187, 4.9350917239397925]
최종 기울기: 11.161
최종 절편: 4.940


from scratch.linear_algebra import vector_mean

# Start from a random slope and intercept
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

learning_rate = 0.001

for epoch in range(20000):
    # Combine the per-point squared-error gradients over the whole training
    # set (vector_mean presumably averages them component-wise)
    grad = vector_mean([linear_gradient(x, y, theta) for x, y in inputs])
    # Update theta: step against the gradient, scaled by the learning rate
    theta = gradient_step(theta, grad, -learning_rate)
    # Print progress once every 1000 epochs
    if epoch % 1000 == 0:
        print(epoch, theta)

slope, intercept = theta

# The printed labels are Korean for "final slope" and "final intercept"
print(f"최종 기울기: {slope:.3f}")
print(f"최종 절편: {intercept:.3f}")

0 [0.539845326196926, -0.8765658904200774]
1000 [3.5347058588398195, 4.120657569741477]
2000 [6.089146831484904, 4.807295221174362]
3000 [8.254060613242245, 4.909957251324375]
4000 [10.08698653375398, 4.932225333575831]
5000 [11.63858380675949, 4.942345701376079]
6000 [12.951998593477466, 4.949733516900191]
7000 [14.063789219509484, 4.955827987429306]
8000 [15.0049066769235, 4.960965378519356]
9000 [15.801551256517898, 4.965311213690506]
10000 [16.475901275838954, 4.968989518433547]
11000 [17.046730424738943, 4.972103106006536]
12000 [17.529930406587045, 4.974738713135767]
13000 [17.938953356610103, 4.976969721779518]
14000 [18.285186344872766, 4.978858243526626]
15000 [18.57826838869039, 4.980456854364114]
16000 [18.826358799830466, 4.981810059132508]
17000 [19.036364337180647, 4.982955530628767]
18000 [19.21413148874078, 4.983925158418778]
19000 [19.36460923600656, 4.984745936634814]
최종 기울기: 19.492
최종 절편: 4.985


from scratch.linear_algebra import vector_mean

# Start from a random slope and intercept
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

# Ten times larger than the 0.001 used in the earlier full-batch runs
learning_rate = 0.01

for epoch in range(5000):
    # Combine the per-point squared-error gradients over the whole training
    # set (vector_mean presumably averages them component-wise)
    grad = vector_mean([linear_gradient(x, y, theta) for x, y in inputs])
    # Update theta: step against the gradient, scaled by the learning rate
    theta = gradient_step(theta, grad, -learning_rate)
    # Print progress once every 500 epochs
    if epoch % 500 == 0:
        print(epoch, theta)

slope, intercept = theta

# The printed labels are Korean for "final slope" and "final intercept"
print(f"최종 기울기: {slope:.3f}")
print(f"최종 절편: {intercept:.3f}")

0 [0.6757583873253522, 0.6957201300303815]
500 [11.706690051576645, 4.942804228398626]
1000 [16.50780846999213, 4.969163560245711]
1500 [18.593137756203685, 4.98053795906695]
2000 [19.498884502023184, 4.9854783387029045]
2500 [19.89228863870891, 4.98762415462158]
3000 [20.0631607014304, 4.988556173271237]
3500 [20.137377668401463, 4.988960988407408]
4000 [20.16961323750889, 4.9891368167500385]
4500 [20.18361450915995, 4.9892131864390965]
최종 기울기: 20.190
최종 절편: 4.989


# Predicted y for every x, using the most recently fitted slope and intercept
zs = [slope *x + intercept for x in xs]

# Observed data points (red dots)
plt.plot(xs, ys, 'r.', label='Actuals')
# Fitted line (solid blue)
plt.plot(xs, zs, 'b-', label='Estimates')

plt.title("Linear Regression")
plt.legend()

plt.show()


from typing import List, Iterator

# Generator function that yields a dataset in mini-batches
from typing import TypeVar

T = TypeVar("T")  # element type of the dataset, e.g. an (x, y) tuple


def minibatches(dataset: List[T],
                batch_size: int,
                shuffle: bool = True) -> Iterator[List[T]]:
    """
    Yield consecutive slices of dataset holding up to batch_size items each.

    dataset: the full dataset (a list with any element type)
    batch_size: number of items per mini-batch; the last batch may be shorter
    shuffle: if true, the order in which the batches are yielded is
             randomized (items inside a batch keep their original order)
    Returns: an iterator over the mini-batches
    """
    # Start index of every batch: 0, batch_size, 2 * batch_size, ...
    batch_starts = list(range(0, len(dataset), batch_size))

    # Randomize the batch order when requested
    if shuffle:
        random.shuffle(batch_starts)

    # Slice out one batch per start index; slicing clips at the end of the
    # dataset, so the final batch is simply shorter
    for start in batch_starts:
        yield dataset[start:start + batch_size]


# Start from a random slope and intercept
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

# Learning rate
learning_rate = 0.001

# 1000 epochs
for epoch in range(1000):
    # Mini-batch size is 20, so with the 100 points in inputs there are
    # 5 gradient computations and theta updates per epoch.
    # Shuffling is on (the default of minibatches).
    for batch in minibatches(inputs, batch_size=20):
        grad = vector_mean([linear_gradient(x, y, theta) for x, y in batch])
        theta = gradient_step(theta, grad, -learning_rate)
    # Print progress once every 100 epochs
    if epoch % 100 == 0:
        print(epoch, theta)

slope, intercept = theta

# The printed labels are Korean for "final slope" and "final intercept"
print(f"최종 기울기: {slope:.3f}")
print(f"최종 절편: {intercept:.3f}")

0 [-0.18096617572366355, -0.810253155496382]
100 [1.4325823295978932, 2.7965832049687727]
200 [2.9277291181808662, 4.127583024520555]
300 [4.307256114408177, 4.6201205624228345]
400 [5.577858834315078, 4.805372329280058]
500 [6.747297927532496, 4.877973082222024]
600 [7.823358363323769, 4.9075106536945095]
700 [8.813363542511828, 4.9227014341159645]
800 [9.724173605813512, 4.930098309332715]
900 [10.562099366999867, 4.935898279024058]
최종 기울기: 11.326
최종 절편: 4.940


# Start from a random slope and intercept
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

# Learning rate
learning_rate = 0.01

# 1000 epochs
for epoch in range(1000):
    # Mini-batch size is 20, so with the 100 points in inputs there are
    # 5 gradient computations and theta updates per epoch.
    # Shuffling is on (the default of minibatches).
    for batch in minibatches(inputs, batch_size=20):
        grad = vector_mean([linear_gradient(x, y, theta) for x, y in batch])
        theta = gradient_step(theta, grad, -learning_rate)
    # Print progress once every 100 epochs
    if epoch % 100 == 0:
        print(epoch, theta)

slope, intercept = theta

# The printed labels are Korean for "final slope" and "final intercept"
print(f"최종 기울기: {slope:.3f}")
print(f"최종 절편: {intercept:.3f}")

0 [1.0089547406154515, 0.9789387260040882]
100 [11.920008658960944, 4.952894140709578]
200 [16.630466145513115, 4.970879458498564]
300 [18.65959750842576, 4.983489508744]
400 [19.533544758414724, 4.986114887570471]
500 [19.910159234650248, 4.9876175552247615]
600 [20.072120246973697, 4.988277830931272]
700 [20.141813699267466, 4.989044796751943]
800 [20.171950036505514, 4.989673862726055]
900 [20.18490980813245, 4.988704425280094]
최종 기울기: 20.190
최종 절편: 4.989


# Start from a random slope and intercept
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

# Learning rate
learning_rate = 0.01

# 3000 epochs
for epoch in range(3000):
    # Mini-batch size is 20, so with the 100 points in inputs there are
    # 5 gradient computations and theta updates per epoch.
    # Shuffling is on (the default of minibatches).
    for batch in minibatches(inputs, batch_size=20):
        grad = vector_mean([linear_gradient(x, y, theta) for x, y in batch])
        theta = gradient_step(theta, grad, -learning_rate)
    # Print progress once every 100 epochs
    if epoch % 100 == 0:
        print(epoch, theta)

slope, intercept = theta

# The printed labels are Korean for "final slope" and "final intercept"
print(f"최종 기울기: {slope:.3f}")
print(f"최종 절편: {intercept:.3f}")

0 [-0.818987279044814, 0.5496759496355055]
100 [11.132254675200233, 4.944402547449223]
200 [16.29128890751974, 4.967275599762484]
300 [18.51315652813586, 4.978364613062517]
400 [19.470399111844596, 4.984337308055089]
500 [19.882668827278895, 4.988157388454654]
600 [20.060140900004065, 4.987966154755475]
700 [20.136885552244106, 4.987828522542775]
800 [20.16977510712028, 4.988976100905835]
900 [20.183962972393914, 4.989098169073801]
1000 [20.19002718033645, 4.989306092251666]
1100 [20.19268900836928, 4.989013649684174]
1200 [20.193941008879644, 4.98931327368746]
1300 [20.194440751083505, 4.988662935916931]
1400 [20.194676502572356, 4.989409414992472]
1500 [20.19501205890768, 4.989279358430207]
1600 [20.194844639671665, 4.9895094830280255]
1700 [20.195038900604747, 4.989915517219994]
1800 [20.195104991373057, 4.988570101328778]
1900 [20.195111383724566, 4.989401145459436]
2000 [20.194820820191822, 4.988704721391355]
2100 [20.194956704659234, 4.989706094313501]
2200 [20.194893907702024, 4.989209975413154]
2300 [20.19482708931829, 4.989765252131663]
2400 [20.19497162191007, 4.990361301343332]
2500 [20.19504959681753, 4.989715152995273]
2600 [20.195037903439133, 4.989885179868943]
2700 [20.195048938410093, 4.989683544147567]
2800 [20.194794217945287, 4.988956630961846]
2900 [20.19472935840392, 4.9896646884885545]
최종 기울기: 20.194
최종 절편: 4.989


# Start from a random slope and intercept
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

# Learning rate
learning_rate = 0.001

# 1000 epochs
for epoch in range(1000):
    # Stochastic gradient descent: update theta after every single data point,
    # visiting the points in the order they appear in inputs
    for x, y in inputs:
        grad = linear_gradient(x, y, theta)
        theta = gradient_step(theta, grad, -learning_rate)
    # Print progress once every 100 epochs
    if epoch % 100 == 0:
        print(epoch, theta)

slope, intercept = theta

# The printed labels are Korean for "final slope" and "final intercept"
print(f"최종 기울기: {slope:.3f}")
print(f"최종 절편: {intercept:.3f}")

0 [-0.15554810709612274, 1.509287732512692]
100 [16.342595331414632, 5.034998314265212]
200 [19.472338990801568, 4.997992330602081]
300 [20.06303054508488, 4.991008012136133]
400 [20.174514599453477, 4.989689828163571]
500 [20.19555552053715, 4.989441040970666]
600 [20.199526674641238, 4.989394086168721]
700 [20.200276169630307, 4.98938522416341]
800 [20.200417625419696, 4.9893835515945835]
900 [20.200444323050316, 4.989383235922631]
최종 기울기: 20.200
최종 절편: 4.989


# Start from a random slope and intercept
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

# Learning rate
learning_rate = 0.01

# 1000 epochs
for epoch in range(1000):
    # Stochastic gradient descent: update theta after every single data point,
    # visiting the points in the order they appear in inputs
    for x, y in inputs:
        grad = linear_gradient(x, y, theta)
        theta = gradient_step(theta, grad, -learning_rate)
    # Print progress once every 100 epochs
    if epoch % 100 == 0:
        print(epoch, theta)

slope, intercept = theta

# The printed labels are Korean for "final slope" and "final intercept"
print(f"최종 기울기: {slope:.3f}")
print(f"최종 절편: {intercept:.3f}")

0 [2.259915006214208, 6.8246196207535865]
100 [20.261030045940156, 4.990193171642614]
200 [20.26103222379059, 4.990192825899716]
300 [20.26103222379092, 4.990192825899663]
400 [20.26103222379092, 4.990192825899663]
500 [20.26103222379092, 4.990192825899663]
600 [20.26103222379092, 4.990192825899663]
700 [20.26103222379092, 4.990192825899663]
800 [20.26103222379092, 4.990192825899663]
900 [20.26103222379092, 4.990192825899663]
최종 기울기: 20.261
최종 절편: 4.990

경사하강법¶

핵심 주제¶

필수 모듈 불러오기¶

핵심 1: 경사하강법 의미¶

경사하강법 기본 아이디어¶

그레이디언트의 정의와 의미¶

경사하강법 작동 방식¶

주의사항¶

핵심 2: 그레이디언트 계산¶

단변수 함수의 도함수 계산¶

다변수 함수의 그레이디언트 계산¶

주의사항¶

핵심 3: 경사하강법과 선형회귀¶

에포크와 스텝 크기¶

선형회귀¶

목표¶

기준¶

핵심 4: 미니배치/확률적 경사하강법¶

미니배치 경사하강법¶

확률적 경사하강법¶

경사하강법 비교¶