GReaT Example with California Housing Dataset
In [1]:
# Execute only once!
import os
import sys
sys.path.append("..")
os.chdir("..")
In [2]:
%load_ext autoreload
%autoreload 2
In [3]:
import numpy as np
import pandas as pd
import logging
from sklearn import datasets
In [4]:
from utils import set_logging_level
from be_great import GReaT
In [5]:
import matplotlib.pyplot as plt
In [6]:
logger = set_logging_level(logging.INFO)
Load Data
In [7]:
data = datasets.fetch_california_housing(as_frame=True).frame
data.head()
Out[7]:
|   | MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | MedHouseVal |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 8.3252 | 41.0 | 6.984127 | 1.023810 | 322.0 | 2.555556 | 37.88 | -122.23 | 4.526 |
| 1 | 8.3014 | 21.0 | 6.238137 | 0.971880 | 2401.0 | 2.109842 | 37.86 | -122.22 | 3.585 |
| 2 | 7.2574 | 52.0 | 8.288136 | 1.073446 | 496.0 | 2.802260 | 37.85 | -122.24 | 3.521 |
| 3 | 5.6431 | 52.0 | 5.817352 | 1.073059 | 558.0 | 2.547945 | 37.85 | -122.25 | 3.413 |
| 4 | 3.8462 | 52.0 | 6.281853 | 1.081081 | 565.0 | 2.181467 | 37.85 | -122.25 | 3.422 |
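The frame holds 20,640 districts: eight numeric feature columns plus the MedHouseVal target (the training log below reports the same number of examples). A quick sanity check before fine-tuning:

In [ ]:
# 20,640 rows, 8 features + the MedHouseVal target
data.shape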
Create GReaT Model
Only one epoch is trained here, for demonstration purposes.
In [8]:
great = GReaT("distilgpt2",                        # Name of the large language model used (see HuggingFace for more options)
              epochs=1,                            # Number of epochs to train (only one epoch for demonstration)
              save_steps=2000,                     # Save model weights every x steps
              logging_steps=50,                    # Log the loss and learning rate every x steps
              experiment_dir="trainer_california", # Name of the directory where all intermediate steps are saved
              # lr_scheduler_type="constant",      # Specify the learning rate scheduler
              # learning_rate=5e-5,                # Set the initial learning rate
              )
Start Training
In [9]:
trainer = great.fit(data)
2022-10-13 10:12:58,445 - INFO - Convert data into HuggingFace dataset object... (great.py:99)
2022-10-13 10:12:58,451 - INFO - Create GReaT Trainer... (great.py:104)
2022-10-13 10:13:00,554 - INFO - Start training... (great.py:113)
/home/kathrin/miniconda3/envs/transformers/lib/python3.9/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
***** Running training *****
  Num examples = 20640
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1290
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
/home/kathrin/miniconda3/envs/transformers/lib/python3.9/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
  warnings.warn('Was asked to gather along dimension 0, but all '
[1290/1290 01:49, Epoch 1/1]
| Step | Training Loss |
|---|---|
| 50 | 2.680700 |
| 100 | 2.031900 |
| 150 | 1.939400 |
| 200 | 1.914700 |
| 250 | 1.875900 |
| 300 | 1.867900 |
| 350 | 1.851000 |
| 400 | 1.836000 |
| 450 | 1.800000 |
| 500 | 1.803000 |
| 550 | 1.806700 |
| 600 | 1.810800 |
| 650 | 1.815500 |
| 700 | 1.797400 |
| 750 | 1.778400 |
| 800 | 1.800900 |
| 850 | 1.793400 |
| 900 | 1.794900 |
| 950 | 1.794400 |
| 1000 | 1.777800 |
| 1050 | 1.773700 |
| 1100 | 1.769200 |
| 1150 | 1.781900 |
| 1200 | 1.772400 |
| 1250 | 1.769500 |
Training completed. Do not forget to share your model on huggingface.co/models =)
In [10]:
loss_hist = trainer.state.log_history.copy()
loss_hist.pop()  # drop the last entry, which holds the overall training summary (shown below)
Out[10]:
{'train_runtime': 111.2294, 'train_samples_per_second': 185.562, 'train_steps_per_second': 11.598, 'total_flos': 464022201434112.0, 'train_loss': 1.854305208191391, 'epoch': 1.0, 'step': 1290}
In [11]:
loss = [x["loss"] for x in loss_hist]
epochs = [x["epoch"] for x in loss_hist]
In [12]:
plt.plot(epochs, loss)
Out[12]:
[<matplotlib.lines.Line2D at 0x7f32cb9e6340>]
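The loss curve is easier to read with labeled axes. A minimal sketch, reusing the epochs and loss lists from the previous cells:

In [ ]:
# same loss curve, with axis labels
plt.plot(epochs, loss)
plt.xlabel("Epoch")
plt.ylabel("Training loss")
plt.show()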
Save Model
In [ ]:
great.save("california")
Load Model
In [ ]:
# great = GReaT.load_from_dir("california")
In [13]:
# Load weights from a previously fine-tuned checkpoint (adjust the path to your setup)
great.load_finetuned_model("../great_private/models/california/california_distilgpt2_100.pt")
Generate Samples
In [14]:
n_samples = 1000
In [19]:
samples = great.sample(n_samples, k=50, device="cuda:1")
1035it [00:09, 107.38it/s]
In [20]:
samples.head()
Out[20]:
|   | MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | MedHouseVal |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6.0990 | 34.0 | 6.242424 | 1.000000 | 1758.0 | 2.856757 | 33.68 | -117.88 | 4.09088 |
| 1 | 4.1875 | 31.0 | 5.879051 | 1.041030 | 1127.0 | 2.446928 | 38.63 | -121.35 | 1.53564 |
| 2 | 3.6000 | 16.0 | 6.837838 | 1.000000 | 693.0 | 2.790062 | 36.64 | -120.84 | 1.10397 |
| 3 | 4.8934 | 23.0 | 5.137055 | 0.945205 | 984.0 | 3.160714 | 38.00 | -121.45 | 1.64981 |
| 4 | 2.4861 | 52.0 | 4.694915 | 1.015965 | 867.0 | 2.251497 | 37.31 | -122.46 | 1.09459 |
In [21]:
samples.shape
Out[21]:
(1000, 9)
In [22]:
samples.to_csv("california_samples.csv")
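Before plotting, comparing the marginal statistics of the synthetic and real data gives a quick quality check. A minimal sketch using only pandas, with the real frame and the generated samples side by side:

In [ ]:
# column-wise summary statistics: real vs. generated data
pd.concat({"real": data.describe().T, "synthetic": samples.describe().T}, axis=1)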
Plot Data
Original Data
In [26]:
true_samples = data.sample(n=1000)
In [27]:
plt.scatter(true_samples["Longitude"], true_samples["Latitude"], c=true_samples["MedHouseVal"])
Out[27]:
<matplotlib.collections.PathCollection at 0x7f32ca77d070>
Generated Samples
In [28]:
# samples = pd.read_csv("california_samples.csv")
In [29]:
plt.scatter(samples["Longitude"], samples["Latitude"], c=samples["MedHouseVal"])
Out[29]:
<matplotlib.collections.PathCollection at 0x7f32ca692fa0>
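For a direct visual comparison, the two maps can be drawn side by side with a shared color scale. A minimal sketch, reusing true_samples and samples from the cells above:

In [ ]:
# real vs. generated districts, colored by MedHouseVal on a shared scale
fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)
vmin, vmax = data["MedHouseVal"].min(), data["MedHouseVal"].max()
axes[0].scatter(true_samples["Longitude"], true_samples["Latitude"],
                c=true_samples["MedHouseVal"], vmin=vmin, vmax=vmax, s=5)
axes[0].set_title("Original data")
axes[1].scatter(samples["Longitude"], samples["Latitude"],
                c=samples["MedHouseVal"], vmin=vmin, vmax=vmax, s=5)
axes[1].set_title("GReaT samples")
for ax in axes:
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
plt.show()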