Neural Network for counting by hand gesture
Cristian D. Angulo L. - 6219701
Department of Computer Science, Faculty of Science and Technology, Assumption University, Thailand
SC6611-Neural Networks and Deep Learning
Hand gesture recognition
• Sign Language Digits
• Dataset from kaggle.com

Image processing
• Convert images into tensors of pixel values
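As an illustration (not from the original slides), a single image from the dataset can be converted into a PyTorch tensor with torchvision's ToTensor transform, which scales pixel values to floats in [0, 1]; the file name below is hypothetical:

from PIL import Image
from torchvision import transforms

img = Image.open('0_IMG_1118.jpg')     # hypothetical file name; the digit label is the first character
tensor = transforms.ToTensor()(img)    # 3 x 64 x 64 float tensor with values in [0, 1]
print(tensor.shape)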
STRUCTURE OF THE NEURAL NETWORK
Three convolutional layers
Two fully connected layers

import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self, num_channels):
        super(Net, self).__init__()
        self.num_channels = num_channels
        self.conv1 = nn.Conv2d(3, self.num_channels, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(self.num_channels, self.num_channels*2, 3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(self.num_channels*2, self.num_channels*4, 3, stride=1, padding=1)
        self.fc1 = nn.Linear(self.num_channels*4*8*8, self.num_channels*4)
        self.fc2 = nn.Linear(self.num_channels*4, 6)

    def forward(self, x):
        # input image: 3 x 64 x 64
        x = self.conv1(x)                 # num_channels x 64 x 64
        x = F.relu(F.max_pool2d(x, 2))    # pooling halves each spatial dimension: num_channels x 32 x 32
        x = self.conv2(x)                 # num_channels*2 x 32 x 32
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*2 x 16 x 16
        x = self.conv3(x)                 # num_channels*4 x 16 x 16
        x = F.relu(F.max_pool2d(x, 2))    # num_channels*4 x 8 x 8
        # flatten
        x = x.view(-1, self.num_channels*4*8*8)
        # fully connected layers
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        # log_softmax over the 6 classes
        x = F.log_softmax(x, dim=1)
        return x
Three convolutional layers (n = num_channels):
1. First convolutional layer
   > Input: 3 x 64 x 64    > Output: n x 64 x 64
   First max pooling
   > Input: n x 64 x 64    > Output: n x 32 x 32
2. Second convolutional layer
   > Input: n x 32 x 32    > Output: 2n x 32 x 32
   Second max pooling
   > Input: 2n x 32 x 32   > Output: 2n x 16 x 16
3. Third convolutional layer
   > Input: 2n x 16 x 16   > Output: 4n x 16 x 16
   Third max pooling
   > Input: 4n x 16 x 16   > Output: 4n x 8 x 8
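A quick way to confirm these shapes (an illustrative check, not part of the original slides) is to push a dummy image through the convolution and pooling steps, here with an example value of num_channels = 32:

import torch
import torch.nn.functional as F

n = 32                                   # example value for num_channels
net = Net(num_channels=n)
x = torch.randn(1, 3, 64, 64)            # one dummy 3 x 64 x 64 image
x = F.relu(F.max_pool2d(net.conv1(x), 2))
print(x.shape)                           # torch.Size([1, 32, 32, 32])  -> n x 32 x 32
x = F.relu(F.max_pool2d(net.conv2(x), 2))
print(x.shape)                           # torch.Size([1, 64, 16, 16])  -> 2n x 16 x 16
x = F.relu(F.max_pool2d(net.conv3(x), 2))
print(x.shape)                           # torch.Size([1, 128, 8, 8])   -> 4n x 8 x 8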
Two fully connected layers
1. First fully connected layer
   > Input: 4n x 8 x 8 (the flattened feature maps, 4n*8*8 values)   > Output: 4n
2. Second fully connected layer
   > Input: 4n   > Output: 6 (one score per digit class, 0-5)
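To check the end-to-end output shape (again an illustrative sketch rather than code from the slides), a dummy batch can be passed through the whole network:

import torch

net = Net(num_channels=32)           # example value for num_channels
batch = torch.randn(4, 3, 64, 64)    # dummy batch of 4 images
out = net(batch)
print(out.shape)                     # torch.Size([4, 6]): one log-probability per class
print(out.exp().sum(dim=1))          # each row sums to ~1, since log_softmax was applied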
Activation Function
ReLU (Rectified Linear Unit), applied after each max-pooling step and after the first fully connected layer: relu(x) = max(0, x).
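For instance (illustrative only), negative activations are set to zero while positive ones pass through unchanged:

import torch
import torch.nn.functional as F

x = torch.tensor([-2.0, -0.5, 0.0, 1.5, 3.0])
print(F.relu(x))    # tensor([0.0000, 0.0000, 0.0000, 1.5000, 3.0000])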
log_softmax
Applying log after softmax as two separate operations is slower and numerically unstable; log_softmax uses an alternative formulation to compute the output and gradient correctly.
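As a small illustration (not from the slides), the two computations agree on well-behaved inputs, and the log-probability output pairs naturally with NLLLoss later on:

import torch
import torch.nn.functional as F

scores = torch.tensor([[1.0, 2.0, 0.5, -1.0, 0.0, 3.0]])   # raw scores for the 6 classes
naive = torch.log(F.softmax(scores, dim=1))                 # log applied after softmax
stable = F.log_softmax(scores, dim=1)                       # fused, numerically stable version
print(torch.allclose(naive, stable))                        # True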
TENSOR OF THE IMAGES
import os
from PIL import Image
from torch.utils.data import Dataset

class SIGNSDataset(Dataset):
    def __init__(self, base_dir, split='train', transform=None):
        path = os.path.join(base_dir, '{}_signs'.format(split))
        files = [f for f in os.listdir(path) if f.endswith('.jpg')]
        self.filenames = [os.path.join(path, f) for f in files]
        # the first character of each file name is the digit shown in the image
        self.targets = [int(f[0]) for f in files]
        self.transform = transform

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        image = Image.open(self.filenames[idx])
        if self.transform:
            image = self.transform(image)
        return image, self.targets[idx]
SIGNSDataset
• Each item is a tuple of two objects: the first is the tensor of the image, and the second is the label of the image (the digit it shows).
> Input: path of the image folder   > Output: (image tensor, label)
Some parameters of this object are:
> base_dir: path where the files are
> split: training, validation, or test subset
> transform: e.g. ToTensor()
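For example (illustrative; the folder names follow the '{}_signs' pattern in the code, and the base directory below is hypothetical), the same class can build the training and test sets:

from torchvision import transforms

base_dir = 'data/SIGNS'   # hypothetical location of the extracted dataset
train_set = SIGNSDataset(base_dir, split='train', transform=transforms.ToTensor())
test_set = SIGNSDataset(base_dir, split='test', transform=transforms.ToTensor())
print(len(train_set), len(test_set))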
DataLoader is one of the utilities PyTorch provides for working with datasets: it represents a Python iterable over a dataset and groups the samples into batches.
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.utils import make_grid

trainset = SIGNSDataset(route, split='train', transform=transforms.ToTensor())
dataloader = DataLoader(trainset, batch_size=32)

input3 = []
for inputs, targets in dataloader:
    out = make_grid(inputs)   # arrange the batch of images into a single grid
    imshow(out)               # display helper (defined elsewhere in the notebook)
    print(targets)
    input3 = inputs
    break

Printed targets for the first batch:
tensor([1, 2, 1, 1, 3, 2, 0, 2, 1, 4, 3, 5, 1, 4, 1, 4, 1, 5, 4, 2, 3, 4, 1, 2, 3, 3, 0, 1, 4, 0, 5, 5])
IMPLEMENTATION
Loss Function
The loss function measures the difference between the network outputs and the targets.
loss_fn = nn.NLLLoss()   # negative log-likelihood loss, paired with the log_softmax output

for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch+1, num_epochs))
    print('-'*10)
    running_loss = RunningMetric()   # running average of the loss
    running_acc = RunningMetric()    # running average of the accuracy
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        _, preds = torch.max(outputs, 1)
        loss = loss_fn(outputs, targets)
        loss.backward()    # compute the gradients
        optimizer.step()   # update the weights with the gradients
        batch_size = inputs.size()[0]
        running_loss.update(loss.item()*batch_size, batch_size)
        running_acc.update(torch.sum(preds == targets).float(), batch_size)
    print("Loss: {:.4f} Acc: {:.4f}".format(running_loss(), running_acc()))
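As an aside (illustrative, not from the slides), NLLLoss expects log-probabilities, which is exactly what the network's log_softmax output provides; combined, they are equivalent to cross-entropy on the raw scores:

import torch
import torch.nn as nn
import torch.nn.functional as F

scores = torch.randn(4, 6)                 # raw scores for a batch of 4 samples, 6 classes
targets = torch.tensor([0, 3, 5, 1])
nll = nn.NLLLoss()(F.log_softmax(scores, dim=1), targets)
ce = nn.CrossEntropyLoss()(scores, targets)
print(torch.allclose(nll, ce))             # True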
Back propagation
During the backward pass (loss.backward()), autograd computes the gradient of the loss with respect to every trainable weight in the network.
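A minimal illustration (not part of the slides, with an example value for num_channels): after backward(), each parameter's .grad field holds its gradient, which the optimizer will use in the next step:

import torch
import torch.nn as nn

net = Net(num_channels=32)              # example value for num_channels
x = torch.randn(4, 3, 64, 64)
targets = torch.randint(0, 6, (4,))
loss = nn.NLLLoss()(net(x), targets)
loss.backward()
print(net.conv1.weight.grad.shape)      # torch.Size([32, 3, 3, 3])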
Update the weights
The optimizer (optimizer.step()) updates the weights using hyperparameters such as:
> learning rate
> momentum
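For reference (a simplified sketch, not the author's code; PyTorch's actual optim.SGD also supports options such as weight decay and dampening), the update that SGD with momentum applies to each weight is roughly:

import torch

# Simplified version of what optim.SGD with momentum does for one weight tensor w:
#   v <- momentum * v + gradient
#   w <- w - learning_rate * v
w = torch.zeros(3)
v = torch.zeros_like(w)
grad = torch.tensor([0.5, -1.0, 2.0])    # pretend this came from loss.backward()
lr, momentum = 0.001, 0.9
v = momentum * v + grad
w = w - lr * v                           # each weight takes a small step against its gradient
print(w)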
TRAIN THE NEURAL NETWORK
1. Neural network: the structure (Net) defined above
2. Optimizer: updates the weights after each batch
3. Epochs: the number of passes over the training data
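The training loop shown earlier also relies on a RunningMetric helper that the slides never define; a minimal sketch of such a helper, assuming it simply keeps a running average, might look like this:

class RunningMetric:
    """Keeps a running average over a stream of (value, count) updates."""
    def __init__(self):
        self.total = 0.0
        self.count = 0
    def update(self, value, n):
        self.total += float(value)
        self.count += n
    def __call__(self):
        return self.total / self.count if self.count else 0.0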
loss_fn = nn.NLLLoss()
# net.parameters() gives the optimizer access to all of the network's weights and biases
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
num_epochs = 50
RESULTS
Epoch: 100/100
Loss: 0.1346 Accuracy: 0.9630
Training time: 0:01:59.808481
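The slides report training accuracy only; as an illustrative extension (not from the original, and assuming a test_signs folder exists alongside the training data), the trained model could be evaluated on the test split in the same way:

import torch
from torch.utils.data import DataLoader
from torchvision import transforms

testset = SIGNSDataset(route, split='test', transform=transforms.ToTensor())
testloader = DataLoader(testset, batch_size=32)

net.eval()                       # switch to evaluation mode
correct, total = 0, 0
with torch.no_grad():            # no gradients needed for evaluation
    for inputs, targets in testloader:
        inputs, targets = inputs.to(device), targets.to(device)
        _, preds = torch.max(net(inputs), 1)
        correct += torch.sum(preds == targets).item()
        total += targets.size(0)
print('Test accuracy: {:.4f}'.format(correct / total))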
CONCEPTS
1. Convolutional layer
2. Max Pooling
3. Fully connected layer
4. Activation function
5. Forward propagation
6. Back propagation
7. Weights and updates
8. Learning rate
9. Epochs
10. Loss function
11. Accuracy
12. Image processing
REFERENCE: M. N. Bansode, S. D. Jambhale, and S. K. Dixit, "Neural Network Based Finger Counting Technique," International Journal of Scientific & Engineering Research, vol. 5, issue 2, February 2014.