We are deeply grateful to the author, Aurélien Géron, for making his materials publicly available. We also gratefully acknowledge Hanbit Media for providing the materials needed to prepare these lecture notes.
Glorot initialization (uniform distribution): sample the weights uniformly between $-r$ and $r$, where

$$r = \sqrt{\dfrac{3}{\mathit{fan}_{\mathrm{avg}}}}$$
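As a rough sketch of what this amounts to in code (the layer sizes below are assumptions for illustration, not values from the text):

import numpy as np

fan_in, fan_out = 784, 300                      # assumed layer sizes
fan_avg = (fan_in + fan_out) / 2
r = np.sqrt(3 / fan_avg)                        # r = sqrt(3 / fan_avg)
W = np.random.uniform(-r, r, size=(fan_in, fan_out))  # weights uniform in [-r, r]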
Initialization strategy | Activation functions | Normal-distribution version | Uniform-distribution version |
---|---|---|---|
Glorot initialization | None, hyperbolic tangent, logistic, softmax | glorot_normal | glorot_uniform |
He initialization | ReLU and its variants | he_normal | he_uniform |
LeCun initialization | SELU | lecun_normal | lecun_uniform |
Keras uses glorot_uniform by default. To use He initialization instead, set the kernel_initializer argument:
keras.layers.Dense(10, activation="relu", kernel_initializer="he_normal")
To use He initialization based on fan_avg rather than fan_in (e.g., with a uniform distribution), use the VarianceScaling initializer:

init = keras.initializers.VarianceScaling(scale=2., mode='fan_avg',
                                          distribution='uniform')
keras.layers.Dense(10, activation="relu", kernel_initializer=init)
ReLU: introduced for deep networks around 2010.

$$\mathrm{ReLU}(z) = \max(0, z)$$

Not perfect: the dying ReLU problem. When a neuron's weighted input sum becomes negative, the neuron keeps outputting zeros and gradient descent can no longer update it.

The leaky ReLU variants below address this and usually perform at least as well as plain ReLU. LeakyReLU is added as a separate layer after a Dense layer that has no activation:
model = keras.models.Sequential([
keras.layers.Flatten(input_shape=[28, 28]),
keras.layers.Dense(300, kernel_initializer="he_normal"),
keras.layers.LeakyReLU(),
keras.layers.Dense(100, kernel_initializer="he_normal"),
keras.layers.LeakyReLU(),
keras.layers.Dense(10, activation="softmax")
])
PReLU works the same way, except that the slope of the negative part is learned during training:

model = keras.models.Sequential([
keras.layers.Flatten(input_shape=[28, 28]),
keras.layers.Dense(300, kernel_initializer="he_normal"),
keras.layers.PReLU(),
keras.layers.Dense(100, kernel_initializer="he_normal"),
keras.layers.PReLU(),
keras.layers.Dense(10, activation="softmax")
])
SELU: specify activation="selu" together with kernel_initializer="lecun_normal".

model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.Dense(300, activation="selu",
kernel_initializer="lecun_normal"))
for layer in range(99):
    model.add(keras.layers.Dense(100, activation="selu",
                                 kernel_initializer="lecun_normal"))
model.add(keras.layers.Dense(10, activation="softmax"))
Batch normalization: add a BatchNormalization layer after each hidden layer (optionally also right after the input layer).

model = keras.models.Sequential([
keras.layers.Flatten(input_shape=[28, 28]),
keras.layers.BatchNormalization(),
keras.layers.Dense(300, activation="relu"),
keras.layers.BatchNormalization(),
keras.layers.Dense(100, activation="relu"),
keras.layers.BatchNormalization(),
keras.layers.Dense(10, activation="softmax")
])
To apply batch normalization before the activation function instead, remove the activation from the Dense layers, add it as a separate Activation layer after each BatchNormalization layer, and drop the bias terms (use_bias=False):

model = keras.models.Sequential([
keras.layers.Flatten(input_shape=[28, 28]),
keras.layers.BatchNormalization(),
keras.layers.Dense(300, use_bias=False),
keras.layers.BatchNormalization(),
keras.layers.Activation("relu"),
keras.layers.Dense(100, use_bias=False),
keras.layers.BatchNormalization(),
keras.layers.Activation("relu"),
keras.layers.Dense(10, activation="softmax")
])
The momentum and axis hyperparameters of the BatchNormalization layer rarely need to be changed from their defaults (see the example below).
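For reference, a sketch with these two arguments written out explicitly; the values shown are the Keras defaults:

keras.layers.BatchNormalization(momentum=0.99, axis=-1)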
Gradient clipping: pass the clipvalue or clipnorm argument when creating the optimizer.

clipvalue: clips each gradient component that exceeds the given threshold, e.g. [0.9, 100] => [0.9, 1.0]
optimizer = keras.optimizers.SGD(clipvalue=1.0)
clipnorm: rescales the whole gradient linearly so that its norm stays within the given threshold, e.g. [0.9, 100] => [0.0089964, 0.9999595]
optimizer = keras.optimizers.SGD(clipnorm=1.0)
Transfer learning: reuse the layers of a pretrained model.

model_A = keras.models.load_model("my_model_A.h5")
model_B_on_A = keras.models.Sequential(model_A.layers[:-1])  # reuse all layers except the output layer
model_B_on_A.add(keras.layers.Dense(1, activation="sigmoid"))  # new output layer for task B
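A common next step, sketched here following the usual transfer-learning recipe (the compile settings are illustrative, not from the original text): freeze the reused layers before compiling so their weights are not disturbed early in training.

for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False                      # freeze the reused layers

model_B_on_A.compile(loss="binary_crossentropy",
                     optimizer="sgd", metrics=["accuracy"])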
# momentum optimization
optimizer = keras.optimizers.SGD(lr=0.001, momentum=0.9)
# Nesterov accelerated gradient
optimizer = keras.optimizers.SGD(lr=0.001, momentum=0.9, nesterov=True)
# AdaGrad
optimizer = keras.optimizers.Adagrad(lr=0.001)
# RMSProp
optimizer = keras.optimizers.RMSprop(lr=0.001, rho=0.9)
# Adam
optimizer = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999)
# Adamax
optimizer = keras.optimizers.Adamax(lr=0.001, beta_1=0.9, beta_2=0.999)
# Nadam
optimizer = keras.optimizers.Nadam(lr=0.001, beta_1=0.9, beta_2=0.999)
Class | Convergence speed | Convergence quality |
---|---|---|
SGD | * | *** |
SGD(momentum=...) | ** | *** |
SGD(momentum=..., nesterov=...) | ** | *** |
Adagrad | *** | * (stops too early) |
RMSProp | *** | ** or *** |
Adam | *** | ** or *** |
Nadam | *** | ** or *** |
AdaMax | *** | ** or *** |
Power scheduling: set the optimizer's decay option.

optimizer = keras.optimizers.SGD(lr=0.01, decay=1e-4)

lr: the initial learning rate.
decay: the inverse of the number of steps $s$.

Exponential scheduling: define a function that maps the epoch to a learning rate.

def exponential_decay_fn(epoch):
    return 0.01 * 0.1**(epoch / 20)
Using a closure makes the initial learning rate and s configurable:

def exponential_decay(lr0, s):
    def exponential_decay_fn(epoch):
        return lr0 * 0.1**(epoch / s)
    return exponential_decay_fn

exponential_decay_fn = exponential_decay(lr0=0.01, s=20)
Wrap the function in a LearningRateScheduler callback and pass it to the fit() method.

lr_scheduler = keras.callbacks.LearningRateScheduler(exponential_decay_fn)
history = model.fit(X_train_scaled, y_train, epochs=n_epochs,
validation_data=(X_valid_scaled, y_valid),
callbacks=[lr_scheduler])
Alternatively, write a custom callback that overrides the on_batch_begin() and on_epoch_end() methods, using the keras.backend module.

K = keras.backend

class ExponentialDecay(keras.callbacks.Callback):
    def __init__(self, s=40000):
        super().__init__()
        self.s = s

    def on_batch_begin(self, batch, logs=None):
        # Note: the `batch` argument is reset at each epoch
        lr = K.get_value(self.model.optimizer.lr)
        K.set_value(self.model.optimizer.lr, lr * 0.1**(1 / self.s))

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        logs['lr'] = K.get_value(self.model.optimizer.lr)
Pass the callback to the fit() method.

s = 20 * len(X_train) // 32  # number of steps in 20 epochs (batch size = 32)
exp_decay = ExponentialDecay(s)
history = model.fit(X_train_scaled, y_train, epochs=n_epochs,
validation_data=(X_valid_scaled, y_valid),
callbacks=[exp_decay])
The keras.optimizers.schedules module can also be used (shown further below).

Piecewise constant scheduling: return a constant learning rate for each range of epochs.

def piecewise_constant_fn(epoch):
    if epoch < 5:
        return 0.01
    elif epoch < 15:
        return 0.005
    else:
        return 0.001
A more general version builds the schedule function from boundaries and values:

import numpy as np

def piecewise_constant(boundaries, values):
    boundaries = np.array([0] + boundaries)
    values = np.array(values)
    def piecewise_constant_fn(epoch):
        return values[np.argmax(boundaries > epoch) - 1]
    return piecewise_constant_fn

piecewise_constant_fn = piecewise_constant([5, 15], [0.01, 0.005, 0.001])
lr_scheduler = keras.callbacks.LearningRateScheduler(piecewise_constant_fn)
history = model.fit(X_train_scaled, y_train, epochs=n_epochs,
validation_data=(X_valid_scaled, y_valid),
callbacks=[lr_scheduler])
Performance scheduling: use the ReduceLROnPlateau callback, which multiplies the learning rate by factor whenever the monitored metric has not improved for patience epochs.

lr_scheduler = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)
keras.optimizers.schedules.ExponentialDecay provides the same exponential scheduling as the exponential_decay_fn() function above; the schedule is passed directly to the optimizer.

s = 20 * len(X_train) // 32  # total number of steps (20 epochs, batch size 32)
learning_rate = keras.optimizers.schedules.ExponentialDecay(0.01, s, 0.1)
optimizer = keras.optimizers.SGD(learning_rate)
Similarly, PiecewiseConstantDecay implements piecewise constant scheduling:

learning_rate = keras.optimizers.schedules.PiecewiseConstantDecay(
boundaries=[5. * n_steps_per_epoch, 15. * n_steps_per_epoch],
values=[0.01, 0.005, 0.001])
optimizer = keras.optimizers.SGD(learning_rate)
Early stopping: use the EarlyStopping callback to stop training automatically when performance has not improved for a given number of epochs.
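A minimal sketch of wiring it into training; the patience value and the restore_best_weights setting are illustrative choices:

early_stopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
history = model.fit(X_train_scaled, y_train, epochs=n_epochs,
                    validation_data=(X_valid_scaled, y_valid),
                    callbacks=[early_stopping_cb])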
ℓ1 and ℓ2 regularization: use the kernel_regularizer option.

layer = keras.layers.Dense(100, activation="elu",
kernel_initializer="he_normal",
kernel_regularizer=keras.regularizers.l2(0.01))
To apply the same settings to every layer, use the functools.partial() function.

from functools import partial
RegularizedDense = partial(keras.layers.Dense,
activation="elu",
kernel_initializer="he_normal",
kernel_regularizer=keras.regularizers.l2(0.01))
model = keras.models.Sequential([
keras.layers.Flatten(input_shape=[28, 28]),
RegularizedDense(300),
RegularizedDense(100),
RegularizedDense(10, activation="softmax")
])
Dropout: add a Dropout layer before every Dense layer (and after the input layer).

model = keras.models.Sequential([
keras.layers.Flatten(input_shape=[28, 28]),
keras.layers.Dropout(rate=0.2),
keras.layers.Dense(300, activation="elu", kernel_initializer="he_normal"),
keras.layers.Dropout(rate=0.2),
keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"),
keras.layers.Dropout(rate=0.2),
keras.layers.Dense(10, activation="softmax")
])
When using the SELU activation function, use alpha dropout instead of regular dropout:
alpha dropout preserves the mean and standard deviation of its inputs,
whereas regular dropout would break self-normalization.
model = keras.models.Sequential([
keras.layers.Flatten(input_shape=[28, 28]),
keras.layers.AlphaDropout(rate=0.2),
keras.layers.Dense(300, activation="selu", kernel_initializer="lecun_normal"),
keras.layers.AlphaDropout(rate=0.2),
keras.layers.Dense(100, activation="selu", kernel_initializer="lecun_normal"),
keras.layers.AlphaDropout(rate=0.2),
keras.layers.Dense(10, activation="softmax")
])
MC dropout: call the model with training=True so that the Dropout layers stay active at prediction time, then average many stochastic predictions.

y_probas = np.stack([model(X_test_scaled, training=True)
                     for sample in range(100)])
y_proba = y_probas.mean(axis=0)
y_std = y_probas.std(axis=0)
This stacks 100 matrices of shape [10000, 10], so y_probas has shape [100, 10000, 10].

To keep dropout active only when making MC predictions, subclass the dropout layers and force training=True:

class MCDropout(keras.layers.Dropout):
    def call(self, inputs):
        return super().call(inputs, training=True)

class MCAlphaDropout(keras.layers.AlphaDropout):
    def call(self, inputs):
        return super().call(inputs, training=True)
mc_model = keras.models.Sequential([
MCAlphaDropout(layer.rate) if isinstance(layer, keras.layers.AlphaDropout) else layer
for layer in model.layers
])
Max-norm regularization: set the kernel_constraint parameter as shown below.

layer = keras.layers.Dense(100, activation="selu", kernel_initializer="lecun_normal",
kernel_constraint=keras.constraints.max_norm(1.))
The axis argument of the max_norm() function must be set appropriately for the layer type (see the sketch below).
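For instance, for a convolutional layer the norm is typically computed over all axes except the output-channel axis; the Conv2D layer and its arguments below are an assumed example:

conv = keras.layers.Conv2D(64, 3, activation="relu", padding="same",
                           kernel_constraint=keras.constraints.max_norm(1., axis=[0, 1, 2]))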
Default DNN configuration:

Hyperparameter | Default |
---|---|
Kernel initializer | He initialization |
Activation function | ELU |
Normalization | Batch normalization (for deep networks only) |
Regularization | Early stopping (plus ℓ2 regularization if needed) |
Optimizer | Momentum optimization (or RMSProp or Nadam) |
Learning rate schedule | 1cycle |
DNN configuration for a self-normalizing network:

Hyperparameter | Default |
---|---|
Kernel initializer | LeCun initialization |
Activation function | SELU |
Normalization | None (self-normalization) |
Regularization | Alpha dropout if needed |
Optimizer | Momentum optimization (or RMSProp or Nadam) |
Learning rate schedule | 1cycle |