wine-pytorch.ipynb

Imports for this Notebook:

# the basic imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

# DataLoader serves a dataset in (optionally shuffled) mini-batches
from torch.utils.data import DataLoader

Load and split the wine quality dataset with Pandas.

# load data (only get rid of the Id column)
wine_data = pd.read_csv("./WineQT.csv", delimiter=",").drop("Id", axis=1)

# split the dataset into model and test subsets
wine_model = wine_data.sample(frac=0.7, random_state=123)
wine_test = wine_data.drop(wine_model.index)

# further split the model subset into train and validation
wine_train = wine_model.sample(frac=0.7, random_state=123)
wine_validate = wine_model.drop(wine_train.index)
# verify the produced data size matches the source
print("source:", len(wine_data))
print("train:", len(wine_train), "validate:", len(wine_validate), "test:", len(wine_test), "==", len(wine_train + wine_validate + wine_test))
source: 1143
train: 560 validate: 240 test: 343 == 1143
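The same two-stage split can also be written with scikit-learn's train_test_split; a sketch of the equivalent calls, assuming scikit-learn is installed (the pandas sample/drop approach above yields the same sizes, though not the same rows):

# hypothetical alternative split via scikit-learn
from sklearn.model_selection import train_test_split

wine_model_alt, wine_test_alt = train_test_split(wine_data, test_size=0.3, random_state=123)
wine_train_alt, wine_validate_alt = train_test_split(wine_model_alt, test_size=0.3, random_state=123)
print(len(wine_train_alt), len(wine_validate_alt), len(wine_test_alt))  # 560 240 343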
wine_train
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
711 7.5 0.71 0.00 1.6 0.092 22.0 31.0 0.99635 3.38 0.58 10.0 6
936 9.1 0.34 0.42 1.8 0.058 9.0 18.0 0.99392 3.18 0.55 11.4 5
683 10.4 0.26 0.48 1.9 0.066 6.0 10.0 0.99724 3.33 0.87 10.9 6
904 8.5 0.40 0.40 6.3 0.050 3.0 10.0 0.99566 3.28 0.56 12.0 4
198 8.9 0.40 0.32 5.6 0.087 10.0 47.0 0.99910 3.38 0.77 10.5 7
... ... ... ... ... ... ... ... ... ... ... ... ...
1076 7.5 0.38 0.57 2.3 0.106 5.0 12.0 0.99605 3.36 0.55 11.4 6
709 8.9 0.32 0.31 2.0 0.088 12.0 19.0 0.99570 3.17 0.55 10.4 6
486 8.1 0.78 0.23 2.6 0.059 5.0 15.0 0.99700 3.37 0.56 11.3 5
182 7.7 0.41 0.76 1.8 0.611 8.0 45.0 0.99680 3.06 1.26 9.4 5
206 8.7 0.52 0.09 2.5 0.091 20.0 49.0 0.99760 3.34 0.86 10.6 7

560 rows × 12 columns

wine_validate
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
309 7.0 0.620 0.18 1.5 0.062 7.0 50.0 0.99510 3.08 0.60 9.3 5
528 8.3 0.760 0.29 4.2 0.075 12.0 16.0 0.99650 3.45 0.68 11.5 6
150 8.2 0.570 0.26 2.2 0.060 28.0 65.0 0.99590 3.30 0.43 10.1 5
278 6.6 0.735 0.02 7.9 0.122 68.0 124.0 0.99940 3.47 0.53 9.9 5
490 8.6 0.490 0.51 2.0 0.422 16.0 62.0 0.99790 3.03 1.17 9.0 5
... ... ... ... ... ... ... ... ... ... ... ... ...
896 10.4 0.430 0.50 2.3 0.068 13.0 19.0 0.99600 3.10 0.87 11.4 6
922 7.6 1.580 0.00 2.1 0.137 5.0 9.0 0.99476 3.50 0.40 10.9 3
219 8.4 0.650 0.60 2.1 0.112 12.0 90.0 0.99730 3.20 0.52 9.2 5
970 7.3 0.740 0.08 1.7 0.094 10.0 45.0 0.99576 3.24 0.50 9.8 5
288 8.8 0.520 0.34 2.7 0.087 24.0 122.0 0.99820 3.26 0.61 9.5 5

240 rows × 12 columns

wine_test
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.99680 3.20 0.68 9.8 5
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.99700 3.26 0.65 9.8 5
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.99800 3.16 0.58 9.8 6
6 7.9 0.60 0.06 1.6 0.069 15.0 59.0 0.99640 3.30 0.46 9.4 5
8 7.8 0.58 0.02 2.0 0.073 9.0 18.0 0.99680 3.36 0.57 9.5 7
... ... ... ... ... ... ... ... ... ... ... ... ...
1128 6.2 0.70 0.15 5.1 0.076 13.0 27.0 0.99622 3.54 0.60 11.9 6
1133 6.7 0.32 0.44 2.4 0.061 24.0 34.0 0.99484 3.29 0.80 11.6 7
1135 5.8 0.61 0.11 1.8 0.066 18.0 28.0 0.99483 3.55 0.66 10.9 6
1137 5.4 0.74 0.09 1.7 0.089 16.0 26.0 0.99402 3.67 0.56 11.6 6
1140 6.2 0.60 0.08 2.0 0.090 32.0 44.0 0.99490 3.45 0.58 10.5 5

343 rows × 12 columns

# isolate results from features
# the usual integration chore: converting the input format into the one the framework expects
# in our case, pandas DataFrames into PyTorch tensors
train_features = torch.tensor(wine_train.drop('quality', axis=1).values.astype(np.float32))
train_target = torch.tensor(wine_train['quality'].values.astype(np.int64))
validate_features = torch.tensor(wine_validate.drop('quality', axis=1).values.astype(np.float32))
validate_target = torch.tensor(wine_validate['quality'].values.astype(np.int64))
test_features = torch.tensor(wine_test.drop('quality', axis=1).values.astype(np.float32))
test_target = torch.tensor(wine_test['quality'].values.astype(np.int64))

train_data = torch.utils.data.TensorDataset(train_features, train_target)
validate_data = torch.utils.data.TensorDataset(validate_features, validate_target)
test_data = torch.utils.data.TensorDataset(test_features, test_target)
train_data[0]
(tensor([ 7.5000,  0.7100,  0.0000,  1.6000,  0.0920, 22.0000, 31.0000,  0.9963,
          3.3800,  0.5800, 10.0000]),
 tensor(6))
validate_data[0]
(tensor([ 7.0000,  0.6200,  0.1800,  1.5000,  0.0620,  7.0000, 50.0000,  0.9951,
          3.0800,  0.6000,  9.3000]),
 tensor(5))
test_data[0]
(tensor([ 7.8000,  0.8800,  0.0000,  2.6000,  0.0980, 25.0000, 67.0000,  0.9968,
          3.2000,  0.6800,  9.8000]),
 tensor(5))
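One thing the notebook skips: the feature columns differ in scale by orders of magnitude (density sits near 1.0 while total sulfur dioxide runs into the dozens), which tends to slow SGD down. A sketch of standardizing with statistics from the training split only, so nothing leaks in from validation or test (the notebook itself continues with the raw values):

# hypothetical preprocessing step, not used below
feat_mean = train_features.mean(dim=0, keepdim=True)
feat_std = train_features.std(dim=0, keepdim=True)
train_features_std = (train_features - feat_mean) / feat_std
validate_features_std = (validate_features - feat_mean) / feat_std
test_features_std = (test_features - feat_mean) / feat_std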
# create data loaders; only the training loader benefits from shuffling
train_loader = torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)
validate_loader = torch.utils.data.DataLoader(validate_data, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)
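A quick sanity check on the loaders (not run above): pulling one batch should yield a 32×11 feature tensor and 32 labels.

features, labels = next(iter(train_loader))
print(features.shape, labels.shape)  # expected: torch.Size([32, 11]) torch.Size([32])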
# check what device to use
from torch.accelerator import is_available, current_accelerator
if is_available():
    device = current_accelerator().type
    print(f"accelerated by {device}")
else:
    device = "cpu"
    print("no accelerator")
accelerated by mps
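torch.accelerator only exists in recent PyTorch releases; on older versions the conventional per-backend check looks roughly like this sketch:

# fallback device detection for older PyTorch
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"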
# define the model
print("**** DEFINING Linear/ReLU layer stack ****")

from torch import nn

class SeqNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(11, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
            # NB: CrossEntropyLoss already applies log-softmax internally, so this
            # explicit Softmax squashes the logits and stalls training (note the
            # flat loss curve during training below)
            nn.Softmax(),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits
**** DEFINING Linear/ReLU layer stack ****
# show what the model looks like (the .to(device) move is commented out,
# so the model stays on the CPU)
model = SeqNN()  # .to(device)
print(model)
SeqNN(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=11, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=10, bias=True)
    (3): Softmax(dim=None)
  )
)
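Since CrossEntropyLoss applies log-softmax internally, the conventional classifier head ends at the last Linear layer and returns raw logits; softmax is applied only at inference time. A sketch of that variant (SeqNNLogits is a hypothetical name, not part of this notebook):

class SeqNNLogits(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(11, 512),
            nn.ReLU(),
            nn.Linear(512, 10),  # raw logits; CrossEntropyLoss normalizes them itself
        )

    def forward(self, x):
        return self.linear_relu_stack(x)

# at inference time, probabilities come from an explicit softmax:
# probs = torch.softmax(model(x), dim=1)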
# define the loss function and optimizer
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.90)
def train_epoch(ep_num):
    running_loss = 0.
    last_loss = 0.

    report_every = 5

    for idx, batch in enumerate(train_loader):
        # unpack the next batch
        features, labels = batch
    
        # NB: .to() is not in-place, so this call alone does nothing; using the
        # accelerator would require features = features.to(device), the same for
        # labels, and moving the model itself. as written, everything runs on the CPU
        features.to(device)
        
        # zero the optimizer gradients
        optimizer.zero_grad()
    
        # feed it to model
        outputs = model(features)
    
        # calculate the loss against the expected label
        loss = loss_fn(outputs, labels)
        loss.backward()
    
        # feed the new loss info to optimizer
        optimizer.step()
    
        # accumulate the batch loss for periodic reporting
        running_loss += loss.item()

        # report the average loss every `report_every` batches
        if idx % report_every == (report_every - 1):
            last_loss = running_loss / report_every
            print("  batch {} loss: {}".format(idx + 1, last_loss))
            tb_x = ep_num * len(train_loader) + idx + 1
            print("  loss/train: {}/{}".format(last_loss, tb_x))
            running_loss = 0.

    return last_loss
# train the model
print("**** TRAINING NN on supplied data ****")

num_epochs = 5
best_vloss = float("inf")

for epoch in range(num_epochs):
    print("EPOCH", epoch + 1)

    model.train(True)
    avg_loss = train_epoch(epoch)

    running_vloss = 0.0
    model.eval()
    with torch.no_grad():
        for vfeatures, vlabels in validate_loader:
            voutputs = model(vfeatures)
            vloss = loss_fn(voutputs, vlabels)
            running_vloss += vloss.item()

    avg_vloss = running_vloss / len(validate_loader)
    print("LOSS train {} / valid {}".format(avg_loss, avg_vloss))

    # track the best validation loss (a fuller loop would also checkpoint the model here)
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
**** TRAINING NN on supplied data ****
EPOCH 1
  batch 5 loss: 2.4407766819000245
  loss/train: 2.4407766819000245/5
  batch 10 loss: 2.451565170288086
  loss/train: 2.451565170288086/10
  batch 15 loss: 2.43599009513855
  loss/train: 2.43599009513855/15
LOSS train 2.43599009513855 / valid 2.4329328536987305
EPOCH 2
  batch 5 loss: 2.444344425201416
  loss/train: 2.444344425201416/23
  batch 10 loss: 2.4506516456604004
  loss/train: 2.4506516456604004/28
  batch 15 loss: 2.4280431270599365
  loss/train: 2.4280431270599365/33
LOSS train 2.4280431270599365 / valid 2.4371285438537598
EPOCH 3
  batch 5 loss: 2.444630241394043
  loss/train: 2.444630241394043/41
  batch 10 loss: 2.4294994354248045
  loss/train: 2.4294994354248045/46
  batch 15 loss: 2.4486732959747313
  loss/train: 2.4486732959747313/51
LOSS train 2.4486732959747313 / valid 2.437175989151001
EPOCH 4
  batch 5 loss: 2.449464464187622
  loss/train: 2.449464464187622/59
  batch 10 loss: 2.435447835922241
  loss/train: 2.435447835922241/64
  batch 15 loss: 2.4380492210388183
  loss/train: 2.4380492210388183/69
LOSS train 2.4380492210388183 / valid 2.437028408050537
EPOCH 5
  batch 5 loss: 2.4409915447235107
  loss/train: 2.4409915447235107/77
  batch 10 loss: 2.443865346908569
  loss/train: 2.443865346908569/82
  batch 15 loss: 2.4377002716064453
  loss/train: 2.4377002716064453/87
LOSS train 2.4377002716064453 / valid 2.4371585845947266
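For context, a 10-class model that predicts the uniform distribution scores a cross-entropy of ln 10 ≈ 2.30, and the loss above never drops much below 2.44, so the network is barely learning. That is the classic symptom of the Softmax flagged at the model definition: its outputs are confined to [0, 1], so the loss function's internal log-softmax sees nearly flat inputs and the gradients all but vanish. Two quick reference numbers:

import math
print(math.log(10))              # ≈ 2.3026: loss of a uniform 10-way guess
print(math.log(1 + 9 / math.e))  # ≈ 1.4611: the floor when a Softmax output
                                 # (at best one-hot) is fed to CrossEntropyLoss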
model.eval()
SeqNN(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=11, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=10, bias=True)
    (3): Softmax(dim=None)
  )
)
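The notebook ends here without scoring the held-out test set; a minimal sketch of that final step might look like the following (not run, so no output is recorded):

# hypothetical evaluation cell: accuracy on wine_test
correct, total = 0, 0
model.eval()
with torch.no_grad():
    for features, labels in test_loader:
        preds = model(features).argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print(f"test accuracy: {correct / total:.3f}")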