PyTorchの基本的なところ

テンソルの添字

>>> t = torch.zeros(2, 3, 4)

>>> t
tensor([[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]])

>>> t.shape
torch.Size([2, 3, 4])

>>> t[0][1][2] = 1
>>> t
tensor([[[0., 0., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]])

t.shape が [2, 3, 4] であるということは、4つのまとまりが3つあるまとまりが2つある、みたいな意味

view による変形

>>> t = torch.Tensor([[[1,2,3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
>>> t
tensor([[[ 1.,  2.,  3.],
         [ 4.,  5.,  6.]],

        [[ 7.,  8.,  9.],
         [10., 11., 12.]]])

>>> t.shape
torch.Size([2, 2, 3])

>>> t2 = t.view(6, 2) # 2個のまとまりを6個に変換
>>> t2
tensor([[ 1.,  2.],
        [ 3.,  4.],
        [ 5.,  6.],
        [ 7.,  8.],
        [ 9., 10.],
        [11., 12.]])
>>> t2.shape
torch.Size([6, 2])

>>> t3 = t.view(4, -1) # X個のまとまりを4個に変換
>>> t3
tensor([[ 1.,  2.,  3.],
        [ 4.,  5.,  6.],
        [ 7.,  8.,  9.],
        [10., 11., 12.]])
>>> t3.shape
torch.Size([4, 3])

>>> t.view(9, -1)
RuntimeError: shape '[9, -1]' is invalid for input of size 12

テンソルの形を、要素数が同じという条件下で変形する。-1 を使うと、他に指定された数値から勝手に埋めてくれる。

squeeze

要素数が1の軸を削除してくれる

>>> t = torch.zeros(2, 1, 3, 1)
>>> t
tensor([[[[0.],
          [0.],
          [0.]]],

        [[[0.],
          [0.],
          [0.]]]])

>>> t.squeeze() # [2, 1, 3, 1] を [2, 3] に整形してくれる
tensor([[0., 0., 0.],
        [0., 0., 0.]])

>>> # 引数(dim)を指定すると dim番目の軸のみに着目
>>> t.squeeze(0) # 0番目は2なのでsqueezeされない
tensor([[[[0.],
          [0.],
          [0.]]],


        [[[0.],
          [0.],
          [0.]]]]) 

>>> t.squeeze(1) # 1番目は1なので [2, 3, 1] に整形
tensor([[[0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.]]])

>>> t.squeeze(3) # 3番目は1なので[2, 1, 3]に整形
tensor([[[0., 0., 0.]],

        [[0., 0., 0.]]])

unsqueeze

squeezeの逆バージョン。要素数が1の軸を足してくれる。dimの指定が必須。

>>> t = torch.zeros(2, 3)
>>> t
tensor([[0., 0., 0.],
        [0., 0., 0.]])
>>> t.unsqueeze(1)
tensor([[[0., 0., 0.]],

        [[0., 0., 0.]]])

>>> t.unsqueeze(2)
tensor([[[0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.]]])

permute

>>> t = torch.Tensor([[[1,2,3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
>>> t
tensor([[[ 1.,  2.,  3.],
         [ 4.,  5.,  6.]],

        [[ 7.,  8.,  9.],
         [10., 11., 12.]]])

>>> t2 = t.permute(2, 0, 1)
>>> t2
tensor([[[ 1.,  4.],
         [ 7., 10.]],

        [[ 2.,  5.],
         [ 8., 11.]],

        [[ 3.,  6.],
         [ 9., 12.]]])

>>> t3 = t.permute(2, 1, 0)
>>> t3
tensor([[[ 1.,  7.],
         [ 4., 10.]],

        [[ 2.,  8.],
         [ 5., 11.]],

        [[ 3.,  9.],
         [ 6., 12.]]])

>>> t[1][0][2]
tensor(9.)

>>> t2[2][1][0]
tensor(9.)

>>> t3[2][0][1]
tensor(9.)

spacy と Multi30k

# Setup for the Multi30k German->English translation dataset with torchtext + spacy.
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator
import spacy

# NOTE: lines starting with '!' are IPython/Jupyter shell commands (download the
# spacy language models), not plain Python — this file is a notebook transcript.
!python -m spacy download en

spacy_en = spacy.load('en')

!python -m spacy download de

spacy_de = spacy.load('de')

# spacy's tokenizer returns a Doc, which is a sequence of Token objects;
# Token.text gives back the raw token string.
l = spacy_en.tokenizer('This is a pen!')
type(l) # => spacy.tokens.doc.Doc
type(l[0]) # => spacy.tokens.token.Token
l[0].text # => 'This'

def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings (tokens).
    """
    # NOTE(review): the original docstring claimed the token list is reversed
    # (as in the classic seq2seq tutorial this snippet derives from), but the
    # body performs no reversal — the docstring now matches actual behavior.
    # If source-side reversal is desired, append [::-1] to the list below.
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    """Tokenize an English string into a list of token strings via spacy."""
    tokens = []
    # Walk the spacy Doc token by token, keeping only the raw text.
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

# Fields describe how raw text becomes tensors: which tokenizer to use,
# the special <sos>/<eos> markers added around each sentence, and lowercasing.
SRC = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)


# Download/split Multi30k; exts maps file extensions to (source, target),
# so German ('.de') is the source and English ('.en') the target.
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))

type(train_data) # => torchtext.datasets.translation.Multi30k

# Each example holds the already-tokenized, lowercased source ('src') and
# target ('trg') sentences as lists of strings.
print(vars(train_data.examples[0]))
"""
=> {
  'src': [
    'zwei',
    'junge',
    'weiße',
    'männer',
    'sind',
    'im',
    'freien',
    'in',
    'der',
    'nähe',
    'vieler',
    'büsche',
    '.'
  ],
  'trg': [
    'two',
    'young',
    ',',
    'white',
    'males',
    'are',
    'outside',
    'near',
    'many',
    'bushes',
    '.'
  ]
}
"""

# Build vocabularies from the training split only; tokens seen fewer than
# 2 times are dropped (mapped to <unk>).
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)


# Number of sentence pairs per batch.
BATCH_SIZE = 128

# BucketIterator batches together examples of similar length to minimize padding.
# NOTE(review): `device` is assumed to be defined earlier in the notebook
# (e.g. torch.device('cuda')) — it is not visible in this file.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

# Grab a single batch for inspection.
for b in train_iterator:
  break

type(b) # => torchtext.data.batch.Batch
type(b.src) # => torch.Tensor
type(b.trg) # => torch.Tensor

# Batches are time-major: [sequence_length, batch_size], not batch-first.
b.src.shape # => [27, 128]
b.trg.shape # => [27, 128]
b.trg
"""
tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [   4,   16,    4,  ...,    4,    4,    4],
        [   9,   50, 4224,  ...,   14,    9,   61],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:0')
"""
# b.trg[:, n] should be the n-th sentence of the batch
b.trg[:, 0]
"""
tensor([  2,   4,   9,  11,  74, 589,  17,   6,  43,  12,   4,  59,  77,   5,
          3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1],
       device='cuda:0')
"""

# Map token ids back to strings with the vocab (itos = index-to-string);
# shorter sentences are filled with <pad> up to the batch's max length.
for t in b.trg[:, 0]:
  print(TRG.vocab.itos[t])

"""
<sos>
a
man
and
some
bicycles
are
in
front
of
a
large
building
.
<eos>
<pad>
<pad>
<pad>
<pad>
<pad>
<pad>
<pad>
<pad>
<pad>
<pad>
<pad>
<pad>
"""