Neural Network for counting by hand...

Preview:

Citation preview

Neural Network for counting by hand gesture

Cristian D. Angulo L. - 6219701

Department of Computer Science, Faculty of Science and Technology, Assumption University in Thailand,

SC6611-Neural Networks and Deep Learning

Cristian Angulo

Neural Network for counting by hand gesture

Hand gesture recognition • Sign Language Digits • Dataset from kaggle.com Image processing • Convert images into int

class Net(nn.Module):
    """Small CNN that maps 3x64x64 images to per-class log-probabilities.

    Three 3x3 convolutions, each followed by 2x2 max pooling and ReLU, feed
    two fully connected layers.

    Args:
        num_channels: Feature maps produced by the first convolution; the
            second and third convolutions produce 2x and 4x as many.
        num_classes: Number of output classes. Defaults to 6, the value that
            used to be hard-coded in the last layer.
    """

    def __init__(self, num_channels, num_classes=6):
        super().__init__()
        self.num_channels = num_channels
        self.num_classes = num_classes

        # kernel 3, stride 1, padding 1 keeps height and width unchanged.
        self.conv1 = nn.Conv2d(3, self.num_channels, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(self.num_channels, self.num_channels*2, 3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(self.num_channels*2, self.num_channels*4, 3, stride=1, padding=1)

        # 8*8 is the spatial size left after three 2x2 poolings of a 64x64 image.
        self.fc1 = nn.Linear(self.num_channels*4*8*8, self.num_channels*4)
        self.fc2 = nn.Linear(self.num_channels*4, num_classes)

    def forward(self, x):
        """Return log-probabilities of shape (batch, num_classes).

        Args:
            x: Image batch of shape (batch, 3, 64, 64).

        Raises:
            RuntimeError: If the spatial size is not 64x64, so the flattened
                features do not match ``fc1``.
        """
        # image 3 x 64 x 64
        x = self.conv1(x)                 # num_channels x 64 x 64
        x = F.relu(F.max_pool2d(x, 2))    # num_channels x 32 x 32 (pooling halves H and W)
        x = self.conv2(x)                 # num_channels*2 x 32 x 32
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*2 x 16 x 16
        x = self.conv3(x)                 # num_channels*4 x 16 x 16
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*4 x 8 x 8

        # Flatten everything except the batch dimension. Unlike view(-1, ...),
        # this cannot fold a wrong-sized image into extra batch rows; fc1
        # rejects the mismatch instead.
        x = x.flatten(start_dim=1)

        # fully connected head
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)

        # log_softmax over the class dimension
        return F.log_softmax(x, dim=1)

STRUCTURE OF THE NEURAL NETWORK

Three convolutional layers

Two fully connected layers

Cristian Angulo

class Net(nn.Module):
    """Small CNN that maps 3x64x64 images to per-class log-probabilities.

    Three 3x3 convolutions, each followed by 2x2 max pooling and ReLU, feed
    two fully connected layers.

    Args:
        num_channels: Feature maps produced by the first convolution; the
            second and third convolutions produce 2x and 4x as many.
        num_classes: Number of output classes. Defaults to 6, the value that
            used to be hard-coded in the last layer.
    """

    def __init__(self, num_channels, num_classes=6):
        super().__init__()
        self.num_channels = num_channels
        self.num_classes = num_classes

        # kernel 3, stride 1, padding 1 keeps height and width unchanged.
        self.conv1 = nn.Conv2d(3, self.num_channels, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(self.num_channels, self.num_channels*2, 3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(self.num_channels*2, self.num_channels*4, 3, stride=1, padding=1)

        # 8*8 is the spatial size left after three 2x2 poolings of a 64x64 image.
        self.fc1 = nn.Linear(self.num_channels*4*8*8, self.num_channels*4)
        self.fc2 = nn.Linear(self.num_channels*4, num_classes)

    def forward(self, x):
        """Return log-probabilities of shape (batch, num_classes).

        Args:
            x: Image batch of shape (batch, 3, 64, 64).

        Raises:
            RuntimeError: If the spatial size is not 64x64, so the flattened
                features do not match ``fc1``.
        """
        # image 3 x 64 x 64
        x = self.conv1(x)                 # num_channels x 64 x 64
        x = F.relu(F.max_pool2d(x, 2))    # num_channels x 32 x 32 (pooling halves H and W)
        x = self.conv2(x)                 # num_channels*2 x 32 x 32
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*2 x 16 x 16
        x = self.conv3(x)                 # num_channels*4 x 16 x 16
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*4 x 8 x 8

        # Flatten everything except the batch dimension. Unlike view(-1, ...),
        # this cannot fold a wrong-sized image into extra batch rows; fc1
        # rejects the mismatch instead.
        x = x.flatten(start_dim=1)

        # fully connected head
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)

        # log_softmax over the class dimension
        return F.log_softmax(x, dim=1)

STRUCTURE OF THE NEURAL NETWORK

Three convolutional layers

Two fully connected layers

Cristian Angulo

class Net(nn.Module):
    """Small CNN that maps 3x64x64 images to per-class log-probabilities.

    Three 3x3 convolutions, each followed by 2x2 max pooling and ReLU, feed
    two fully connected layers.

    Args:
        num_channels: Feature maps produced by the first convolution; the
            second and third convolutions produce 2x and 4x as many.
        num_classes: Number of output classes. Defaults to 6, the value that
            used to be hard-coded in the last layer.
    """

    def __init__(self, num_channels, num_classes=6):
        super().__init__()
        self.num_channels = num_channels
        self.num_classes = num_classes

        # kernel 3, stride 1, padding 1 keeps height and width unchanged.
        self.conv1 = nn.Conv2d(3, self.num_channels, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(self.num_channels, self.num_channels*2, 3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(self.num_channels*2, self.num_channels*4, 3, stride=1, padding=1)

        # 8*8 is the spatial size left after three 2x2 poolings of a 64x64 image.
        self.fc1 = nn.Linear(self.num_channels*4*8*8, self.num_channels*4)
        self.fc2 = nn.Linear(self.num_channels*4, num_classes)

    def forward(self, x):
        """Return log-probabilities of shape (batch, num_classes).

        Args:
            x: Image batch of shape (batch, 3, 64, 64).

        Raises:
            RuntimeError: If the spatial size is not 64x64, so the flattened
                features do not match ``fc1``.
        """
        # image 3 x 64 x 64
        x = self.conv1(x)                 # num_channels x 64 x 64
        x = F.relu(F.max_pool2d(x, 2))    # num_channels x 32 x 32 (pooling halves H and W)
        x = self.conv2(x)                 # num_channels*2 x 32 x 32
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*2 x 16 x 16
        x = self.conv3(x)                 # num_channels*4 x 16 x 16
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*4 x 8 x 8

        # Flatten everything except the batch dimension. Unlike view(-1, ...),
        # this cannot fold a wrong-sized image into extra batch rows; fc1
        # rejects the mismatch instead.
        x = x.flatten(start_dim=1)

        # fully connected head
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)

        # log_softmax over the class dimension
        return F.log_softmax(x, dim=1)

STRUCTURE OF THE NEURAL NETWORK

Three convolutional layers

1. First convolutional >Input: 3 >Output: n 2. Second convolutional >Input: n >Output: 2*n 3. Third convolutional >Input: 2*n >Output: 4*n

Cristian Angulo

class Net(nn.Module):
    """Small CNN that maps 3x64x64 images to per-class log-probabilities.

    Three 3x3 convolutions, each followed by 2x2 max pooling and ReLU, feed
    two fully connected layers.

    Args:
        num_channels: Feature maps produced by the first convolution; the
            second and third convolutions produce 2x and 4x as many.
        num_classes: Number of output classes. Defaults to 6, the value that
            used to be hard-coded in the last layer.
    """

    def __init__(self, num_channels, num_classes=6):
        super().__init__()
        self.num_channels = num_channels
        self.num_classes = num_classes

        # kernel 3, stride 1, padding 1 keeps height and width unchanged.
        self.conv1 = nn.Conv2d(3, self.num_channels, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(self.num_channels, self.num_channels*2, 3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(self.num_channels*2, self.num_channels*4, 3, stride=1, padding=1)

        # 8*8 is the spatial size left after three 2x2 poolings of a 64x64 image.
        self.fc1 = nn.Linear(self.num_channels*4*8*8, self.num_channels*4)
        self.fc2 = nn.Linear(self.num_channels*4, num_classes)

    def forward(self, x):
        """Return log-probabilities of shape (batch, num_classes).

        Args:
            x: Image batch of shape (batch, 3, 64, 64).

        Raises:
            RuntimeError: If the spatial size is not 64x64, so the flattened
                features do not match ``fc1``.
        """
        # image 3 x 64 x 64
        x = self.conv1(x)                 # num_channels x 64 x 64
        x = F.relu(F.max_pool2d(x, 2))    # num_channels x 32 x 32 (pooling halves H and W)
        x = self.conv2(x)                 # num_channels*2 x 32 x 32
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*2 x 16 x 16
        x = self.conv3(x)                 # num_channels*4 x 16 x 16
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*4 x 8 x 8

        # Flatten everything except the batch dimension. Unlike view(-1, ...),
        # this cannot fold a wrong-sized image into extra batch rows; fc1
        # rejects the mismatch instead.
        x = x.flatten(start_dim=1)

        # fully connected head
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)

        # log_softmax over the class dimension
        return F.log_softmax(x, dim=1)

STRUCTURE OF THE NEURAL NETWORK

Three convolutional layers

1. First convolutional >Input: 3 x 64 x 64 >Output: n x 64 x 64 2. Second convolutional >Input: n x 32 x 32 >Output: 2n x 32 x 32 3. Third convolutional >Input: 2n x 16 x 16 >Output: 4n x 16 x 16

Cristian Angulo

class Net(nn.Module):
    """Small CNN that maps 3x64x64 images to per-class log-probabilities.

    Three 3x3 convolutions, each followed by 2x2 max pooling and ReLU, feed
    two fully connected layers.

    Args:
        num_channels: Feature maps produced by the first convolution; the
            second and third convolutions produce 2x and 4x as many.
        num_classes: Number of output classes. Defaults to 6, the value that
            used to be hard-coded in the last layer.
    """

    def __init__(self, num_channels, num_classes=6):
        super().__init__()
        self.num_channels = num_channels
        self.num_classes = num_classes

        # kernel 3, stride 1, padding 1 keeps height and width unchanged.
        self.conv1 = nn.Conv2d(3, self.num_channels, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(self.num_channels, self.num_channels*2, 3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(self.num_channels*2, self.num_channels*4, 3, stride=1, padding=1)

        # 8*8 is the spatial size left after three 2x2 poolings of a 64x64 image.
        self.fc1 = nn.Linear(self.num_channels*4*8*8, self.num_channels*4)
        self.fc2 = nn.Linear(self.num_channels*4, num_classes)

    def forward(self, x):
        """Return log-probabilities of shape (batch, num_classes).

        Args:
            x: Image batch of shape (batch, 3, 64, 64).

        Raises:
            RuntimeError: If the spatial size is not 64x64, so the flattened
                features do not match ``fc1``.
        """
        # image 3 x 64 x 64
        x = self.conv1(x)                 # num_channels x 64 x 64
        x = F.relu(F.max_pool2d(x, 2))    # num_channels x 32 x 32 (pooling halves H and W)
        x = self.conv2(x)                 # num_channels*2 x 32 x 32
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*2 x 16 x 16
        x = self.conv3(x)                 # num_channels*4 x 16 x 16
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*4 x 8 x 8

        # Flatten everything except the batch dimension. Unlike view(-1, ...),
        # this cannot fold a wrong-sized image into extra batch rows; fc1
        # rejects the mismatch instead.
        x = x.flatten(start_dim=1)

        # fully connected head
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)

        # log_softmax over the class dimension
        return F.log_softmax(x, dim=1)

STRUCTURE OF THE NEURAL NETWORK

Three convolutional layers

1. First convolutional >Input: 3 x 64 x 64 >Output: n x 64 x 64 First Max Pooling: >Input: n x 64 x 64 >Output: n x 32 x 32 2. Second convolutional >Input: n x 32 x 32 >Output: 2n x 32 x 32 Second Max Pooling: >Input: 2n x 32 x 32 >Output: 2n x 16 x 16 3. Third convolutional >Input: 2n x 16 x 16 >Output: 4n x 16 x 16 Third Max Pooling: >Input: 4n x 16 x 16 >Output: 4n x 8 x 8

Cristian Angulo

class Net(nn.Module):
    """Small CNN that maps 3x64x64 images to per-class log-probabilities.

    Three 3x3 convolutions, each followed by 2x2 max pooling and ReLU, feed
    two fully connected layers.

    Args:
        num_channels: Feature maps produced by the first convolution; the
            second and third convolutions produce 2x and 4x as many.
        num_classes: Number of output classes. Defaults to 6, the value that
            used to be hard-coded in the last layer.
    """

    def __init__(self, num_channels, num_classes=6):
        super().__init__()
        self.num_channels = num_channels
        self.num_classes = num_classes

        # kernel 3, stride 1, padding 1 keeps height and width unchanged.
        self.conv1 = nn.Conv2d(3, self.num_channels, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(self.num_channels, self.num_channels*2, 3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(self.num_channels*2, self.num_channels*4, 3, stride=1, padding=1)

        # 8*8 is the spatial size left after three 2x2 poolings of a 64x64 image.
        self.fc1 = nn.Linear(self.num_channels*4*8*8, self.num_channels*4)
        self.fc2 = nn.Linear(self.num_channels*4, num_classes)

    def forward(self, x):
        """Return log-probabilities of shape (batch, num_classes).

        Args:
            x: Image batch of shape (batch, 3, 64, 64).

        Raises:
            RuntimeError: If the spatial size is not 64x64, so the flattened
                features do not match ``fc1``.
        """
        # image 3 x 64 x 64
        x = self.conv1(x)                 # num_channels x 64 x 64
        x = F.relu(F.max_pool2d(x, 2))    # num_channels x 32 x 32 (pooling halves H and W)
        x = self.conv2(x)                 # num_channels*2 x 32 x 32
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*2 x 16 x 16
        x = self.conv3(x)                 # num_channels*4 x 16 x 16
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*4 x 8 x 8

        # Flatten everything except the batch dimension. Unlike view(-1, ...),
        # this cannot fold a wrong-sized image into extra batch rows; fc1
        # rejects the mismatch instead.
        x = x.flatten(start_dim=1)

        # fully connected head
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)

        # log_softmax over the class dimension
        return F.log_softmax(x, dim=1)

STRUCTURE OF THE NEURAL NETWORK

Two fully connected layers

1. First fully connected >Input: 4n x 8 x 8 (flattened) >Output: 4n 2. Second fully connected >Input: 4n >Output: 6

Cristian Angulo

class Net(nn.Module):
    """Small CNN that maps 3x64x64 images to per-class log-probabilities.

    Three 3x3 convolutions, each followed by 2x2 max pooling and ReLU, feed
    two fully connected layers.

    Args:
        num_channels: Feature maps produced by the first convolution; the
            second and third convolutions produce 2x and 4x as many.
        num_classes: Number of output classes. Defaults to 6, the value that
            used to be hard-coded in the last layer.
    """

    def __init__(self, num_channels, num_classes=6):
        super().__init__()
        self.num_channels = num_channels
        self.num_classes = num_classes

        # kernel 3, stride 1, padding 1 keeps height and width unchanged.
        self.conv1 = nn.Conv2d(3, self.num_channels, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(self.num_channels, self.num_channels*2, 3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(self.num_channels*2, self.num_channels*4, 3, stride=1, padding=1)

        # 8*8 is the spatial size left after three 2x2 poolings of a 64x64 image.
        self.fc1 = nn.Linear(self.num_channels*4*8*8, self.num_channels*4)
        self.fc2 = nn.Linear(self.num_channels*4, num_classes)

    def forward(self, x):
        """Return log-probabilities of shape (batch, num_classes).

        Args:
            x: Image batch of shape (batch, 3, 64, 64).

        Raises:
            RuntimeError: If the spatial size is not 64x64, so the flattened
                features do not match ``fc1``.
        """
        # image 3 x 64 x 64
        x = self.conv1(x)                 # num_channels x 64 x 64
        x = F.relu(F.max_pool2d(x, 2))    # num_channels x 32 x 32 (pooling halves H and W)
        x = self.conv2(x)                 # num_channels*2 x 32 x 32
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*2 x 16 x 16
        x = self.conv3(x)                 # num_channels*4 x 16 x 16
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*4 x 8 x 8

        # Flatten everything except the batch dimension. Unlike view(-1, ...),
        # this cannot fold a wrong-sized image into extra batch rows; fc1
        # rejects the mismatch instead.
        x = x.flatten(start_dim=1)

        # fully connected head
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)

        # log_softmax over the class dimension
        return F.log_softmax(x, dim=1)

STRUCTURE OF THE NEURAL NETWORK

Activation Function

ReLU or Rectified Linear Unit

Cristian Angulo

class Net(nn.Module):
    """Small CNN that maps 3x64x64 images to per-class log-probabilities.

    Three 3x3 convolutions, each followed by 2x2 max pooling and ReLU, feed
    two fully connected layers.

    Args:
        num_channels: Feature maps produced by the first convolution; the
            second and third convolutions produce 2x and 4x as many.
        num_classes: Number of output classes. Defaults to 6, the value that
            used to be hard-coded in the last layer.
    """

    def __init__(self, num_channels, num_classes=6):
        super().__init__()
        self.num_channels = num_channels
        self.num_classes = num_classes

        # kernel 3, stride 1, padding 1 keeps height and width unchanged.
        self.conv1 = nn.Conv2d(3, self.num_channels, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(self.num_channels, self.num_channels*2, 3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(self.num_channels*2, self.num_channels*4, 3, stride=1, padding=1)

        # 8*8 is the spatial size left after three 2x2 poolings of a 64x64 image.
        self.fc1 = nn.Linear(self.num_channels*4*8*8, self.num_channels*4)
        self.fc2 = nn.Linear(self.num_channels*4, num_classes)

    def forward(self, x):
        """Return log-probabilities of shape (batch, num_classes).

        Args:
            x: Image batch of shape (batch, 3, 64, 64).

        Raises:
            RuntimeError: If the spatial size is not 64x64, so the flattened
                features do not match ``fc1``.
        """
        # image 3 x 64 x 64
        x = self.conv1(x)                 # num_channels x 64 x 64
        x = F.relu(F.max_pool2d(x, 2))    # num_channels x 32 x 32 (pooling halves H and W)
        x = self.conv2(x)                 # num_channels*2 x 32 x 32
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*2 x 16 x 16
        x = self.conv3(x)                 # num_channels*4 x 16 x 16
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*4 x 8 x 8

        # Flatten everything except the batch dimension. Unlike view(-1, ...),
        # this cannot fold a wrong-sized image into extra batch rows; fc1
        # rejects the mismatch instead.
        x = x.flatten(start_dim=1)

        # fully connected head
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)

        # log_softmax over the class dimension
        return F.log_softmax(x, dim=1)

STRUCTURE OF THE NEURAL NETWORK

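log_softmax

log_softmax returns the logarithm of the softmax probabilities. Although it is mathematically equivalent to log(softmax(x)), it uses an alternative formulation to compute the output and gradient correctly, which is faster and numerically more stable than applying the two operations separately.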

Cristian Angulo

TENSOR OF THE IMAGES

Cristian Angulo

class SIGNSDataset(Dataset):
    """Image dataset read from the folder ``<base_dir>/<split>_signs``.

    Every ``.jpg`` file in the folder is one sample; its label is the first
    character of the file name converted to ``int``.

    Args:
        base_dir: Directory that contains the ``<split>_signs`` folder.
        split: Prefix of the folder to read (default ``'train'``).
        transform: Optional callable applied to each opened image.

    Raises:
        ValueError: If a ``.jpg`` file name does not start with a digit.
    """

    def __init__(self, base_dir, split='train', transform=None):
        path = os.path.join(base_dir, '{}_signs'.format(split))
        # Filter once and build both lists from the same names, so that
        # filenames[i] and targets[i] always refer to the same file even when
        # the folder also holds non-image entries.
        image_names = [f for f in os.listdir(path) if f.endswith('.jpg')]
        self.filenames = [os.path.join(path, f) for f in image_names]
        self.targets = [int(f[0]) for f in image_names]
        self.transform = transform

    def __len__(self):
        """Return the number of ``.jpg`` samples found."""
        return len(self.filenames)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for sample ``idx``.

        The image is the opened file, passed through ``transform`` when one
        was given.
        """
        image = Image.open(self.filenames[idx])
        if self.transform:
            image = self.transform(image)
        return image, self.targets[idx]

SIGNSDataset

• It will create a tuple with two objects; the first object is the tensor of the image, and the second object is the label (target class) of the image.

>input: route of the images >output: tensor

TENSOR OF THE IMAGES

SIGNSDataset

Some parameters of this object are: >path where the files are. >split: training, validation, test >transform: ToTensor()

Cristian Angulo

class SIGNSDataset(Dataset):
    """Image dataset read from the folder ``<base_dir>/<split>_signs``.

    Every ``.jpg`` file in the folder is one sample; its label is the first
    character of the file name converted to ``int``.

    Args:
        base_dir: Directory that contains the ``<split>_signs`` folder.
        split: Prefix of the folder to read (default ``'train'``).
        transform: Optional callable applied to each opened image.

    Raises:
        ValueError: If a ``.jpg`` file name does not start with a digit.
    """

    def __init__(self, base_dir, split='train', transform=None):
        path = os.path.join(base_dir, '{}_signs'.format(split))
        # Filter once and build both lists from the same names, so that
        # filenames[i] and targets[i] always refer to the same file even when
        # the folder also holds non-image entries.
        image_names = [f for f in os.listdir(path) if f.endswith('.jpg')]
        self.filenames = [os.path.join(path, f) for f in image_names]
        self.targets = [int(f[0]) for f in image_names]
        self.transform = transform

    def __len__(self):
        """Return the number of ``.jpg`` samples found."""
        return len(self.filenames)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for sample ``idx``.

        The image is the opened file, passed through ``transform`` when one
        was given.
        """
        image = Image.open(self.filenames[idx])
        if self.transform:
            image = self.transform(image)
        return image, self.targets[idx]

TENSOR OF THE IMAGES

SIGNSDataset

DataLoader is one of the utilities of Pytorch to work with datasets. DataLoader represents a Python iterable over a dataset

Cristian Angulo

class SIGNSDataset(Dataset):
    """Image dataset read from the folder ``<base_dir>/<split>_signs``.

    Every ``.jpg`` file in the folder is one sample; its label is the first
    character of the file name converted to ``int``.

    Args:
        base_dir: Directory that contains the ``<split>_signs`` folder.
        split: Prefix of the folder to read (default ``'train'``).
        transform: Optional callable applied to each opened image.

    Raises:
        ValueError: If a ``.jpg`` file name does not start with a digit.
    """

    def __init__(self, base_dir, split='train', transform=None):
        path = os.path.join(base_dir, '{}_signs'.format(split))
        # Filter once and build both lists from the same names, so that
        # filenames[i] and targets[i] always refer to the same file even when
        # the folder also holds non-image entries.
        image_names = [f for f in os.listdir(path) if f.endswith('.jpg')]
        self.filenames = [os.path.join(path, f) for f in image_names]
        self.targets = [int(f[0]) for f in image_names]
        self.transform = transform

    def __len__(self):
        """Return the number of ``.jpg`` samples found."""
        return len(self.filenames)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for sample ``idx``.

        The image is the opened file, passed through ``transform`` when one
        was given.
        """
        image = Image.open(self.filenames[idx])
        if self.transform:
            image = self.transform(image)
        return image, self.targets[idx]

# Build the training set, wrap it in a DataLoader and display the first batch
# of 32 images together with its labels.
trainset = SIGNSDataset(route, split='train', transform=transforms.ToTensor())
dataloader = DataLoader(trainset, batch_size=32)

input3 = []  # stays an empty list if the dataloader yields nothing
for inputs, targets in dataloader:
    out = make_grid(inputs)  # combine the batch into a single image grid
    imshow(out)
    print(targets)
    input3 = inputs  # keep a reference to this batch after the loop
    break  # only the first batch is shown

# Output printed by the slide's run:
# tensor([1, 2, 1, 1, 3, 2, 0, 2, 1, 4, 3, 5, 1, 4, 1, 4, 1, 5, 4, 2, 3, 4, 1, 2, 3, 3, 0, 1, 4, 0, 5, 5])

IMPLEMENTATION

Cristian Angulo

Loss Function

The difference between the outputs and the targets.

# Loss function: negative log-likelihood, applied to the log-probabilities
# returned by the network. It is defined before the loop that uses it.
loss_fn = nn.NLLLoss()

# One pass over `dataloader` per epoch: forward, loss, backward, optimizer
# step, then report the epoch's running loss and accuracy.
# NOTE(review): indentation was lost in this export; the nesting below (summary
# printed once per epoch) is the presumed original layout — confirm.
for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch+1, num_epochs))
    print('-'*10)

    # New accumulators each epoch, so the summary covers this epoch only.
    running_loss = RunningMetric()  # loss
    running_acc = RunningMetric()  # accuracy

    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()  # clear gradients left from the previous batch

        outputs = net(inputs)
        _, preds = torch.max(outputs, 1)  # index of the highest score per sample
        loss = loss_fn(outputs, targets)

        loss.backward()  # gradient
        optimizer.step()  # update the weights with the gradient

        batch_size = inputs.size()[0]
        # loss.item() is multiplied by the batch size before accumulating;
        # presumably RunningMetric divides by the total count — verify.
        running_loss.update(loss.item()*batch_size, batch_size)
        running_acc.update(torch.sum(preds == targets).float(), batch_size)

    print("Loss: {:.4f} Acc: {:.4f}". format(running_loss(),running_acc()))

IMPLEMENTATION

Cristian Angulo

Back propagation

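During the backward pass, loss.backward() computes the gradient of the loss with respect to every weight of the network; because the loss is averaged over the batch, each gradient is an average over the samples in that batch.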

# One pass over `dataloader` per epoch: forward, loss, backward, optimizer
# step, then report the epoch's running loss and accuracy.
# NOTE(review): indentation was lost in this export; the nesting below (summary
# printed once per epoch) is the presumed original layout — confirm.
for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch+1, num_epochs))
    print('-'*10)

    # New accumulators each epoch, so the summary covers this epoch only.
    running_loss = RunningMetric()  # loss
    running_acc = RunningMetric()  # accuracy

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()  # clear gradients left from the previous batch

        outputs = net(inputs)
        preds = torch.max(outputs, 1)[1]  # index of the highest score per sample
        loss = loss_fn(outputs, targets)

        loss.backward()  # gradient
        optimizer.step()  # update the weights with the gradient

        batch_size = inputs.size(0)
        # loss.item() is multiplied by the batch size before accumulating;
        # presumably RunningMetric divides by the total count — verify.
        running_loss.update(loss.item() * batch_size, batch_size)
        running_acc.update((preds == targets).sum().float(), batch_size)

    print("Loss: {:.4f} Acc: {:.4f}".format(running_loss(), running_acc()))

IMPLEMENTATION

Cristian Angulo

Update the weights

The optimizer updates the weights and uses some parameters such as: >learning rate and momentum

# One pass over `dataloader` per epoch: forward, loss, backward, optimizer
# step, then report the epoch's running loss and accuracy.
# NOTE(review): indentation was lost in this export; the nesting below (summary
# printed once per epoch) is the presumed original layout — confirm.
for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch+1, num_epochs))
    print('-'*10)

    # New accumulators each epoch, so the summary covers this epoch only.
    running_loss = RunningMetric()  # loss
    running_acc = RunningMetric()  # accuracy

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()  # clear gradients left from the previous batch

        outputs = net(inputs)
        preds = torch.max(outputs, 1)[1]  # index of the highest score per sample
        loss = loss_fn(outputs, targets)

        loss.backward()  # gradient
        optimizer.step()  # update the weights with the gradient

        batch_size = inputs.size(0)
        # loss.item() is multiplied by the batch size before accumulating;
        # presumably RunningMetric divides by the total count — verify.
        running_loss.update(loss.item() * batch_size, batch_size)
        running_acc.update((preds == targets).sum().float(), batch_size)

    print("Loss: {:.4f} Acc: {:.4f}".format(running_loss(), running_acc()))

TRAIN THE NEURAL NETWORK

Cristian Angulo

1. Neural network: structure

2. Optimizer: update the weights

3. Epochs: number of complete passes over the training set

# Training setup. These names are read by the loop below, so they are defined
# before it.
num_epochs = 50
loss_fn = nn.NLLLoss()
# net.parameters() hands the optimizer the parameters of the network that it
# will update.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# One pass over `dataloader` per epoch: forward, loss, backward, optimizer
# step, then report the epoch's running loss and accuracy.
# NOTE(review): indentation was lost in this export; the nesting below (summary
# printed once per epoch) is the presumed original layout — confirm.
for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch+1, num_epochs))
    print('-'*10)

    # New accumulators each epoch, so the summary covers this epoch only.
    running_loss = RunningMetric()  # loss
    running_acc = RunningMetric()  # accuracy

    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()  # clear gradients left from the previous batch

        outputs = net(inputs)
        _, preds = torch.max(outputs, 1)  # index of the highest score per sample
        loss = loss_fn(outputs, targets)

        loss.backward()  # gradient
        optimizer.step()  # update the weights with the gradient

        batch_size = inputs.size()[0]
        # loss.item() is multiplied by the batch size before accumulating;
        # presumably RunningMetric divides by the total count — verify.
        running_loss.update(loss.item()*batch_size, batch_size)
        running_acc.update(torch.sum(preds == targets).float(), batch_size)

    print("Loss: {:.4f} Acc: {:.4f}". format(running_loss(),running_acc()))

RESULTS

Cristian Angulo

# One pass over `dataloader` per epoch: forward, loss, backward, optimizer
# step, then report the epoch's running loss and accuracy.
# NOTE(review): indentation was lost in this export; the nesting below (summary
# printed once per epoch) is the presumed original layout — confirm.
for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch+1, num_epochs))
    print('-'*10)

    # New accumulators each epoch, so the summary covers this epoch only.
    running_loss = RunningMetric()  # loss
    running_acc = RunningMetric()  # accuracy

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()  # clear gradients left from the previous batch

        outputs = net(inputs)
        preds = torch.max(outputs, 1)[1]  # index of the highest score per sample
        loss = loss_fn(outputs, targets)

        loss.backward()  # gradient
        optimizer.step()  # update the weights with the gradient

        batch_size = inputs.size(0)
        # loss.item() is multiplied by the batch size before accumulating;
        # presumably RunningMetric divides by the total count — verify.
        running_loss.update(loss.item() * batch_size, batch_size)
        running_acc.update((preds == targets).sum().float(), batch_size)

    print("Loss: {:.4f} Acc: {:.4f}".format(running_loss(), running_acc()))

Epoch: 100/100

Loss: 0.1346 Accuracy: 0.9630

Training time: 0:01:59.808481

CONCEPTS

1. Convolutional layer

2. Max Pooling

3. Fully connected layer

4. Activation function

5. Forward propagation

6. Back propagation

7. Weights and updates

8. Learning rate

9. Epochs

10. Loss function

11. Accuracy

12. Image processing

REFERENCE: Prof. Ms. M. N. Bansode, Prof. S. D. Jambhale, Dr. S. K. Dixit, "Neural Network based finger counting Technique", International Journal of Scientific & Engineering Research, Volume 5, Issue 2, February 2014

Recommended