Szarny.io

There should be one-- and preferably only one --obvious way to do it.

Base64 Encoder + Decoder をPythonで実装する

はじめに

よくマルチメディアデータのエンコーディングに使われており,目にすることが多いBase64.いまいち仕組みがよく分かっていなかったので,イチから実装してみました.

仕組み

Base64変換の手順を以下に挙げる。

  1. 元データを6ビットずつに分割。(6ビットに満たない分は0を追加して6ビットにする)
  1. 各6ビットの値を変換表を使って4文字ずつ変換。(4文字に満たない分は = 記号を追加して4文字にする)

Base64 - Wikipedia より

つまり,8bitのASCIIを受け取って,6bitにセパレートして,変換表で変換するわけですね.
"AB"をエンコードすると,以下のような流れになるでしょう.

f:id:Szarny:20170904150236p:plain

Pythonで実装

import sys

# Conversion table of base64
encode_table = {
    "110000": "w","110001": "x","110101": "1","110100": "0",
    "010100": "U","010101": "V","001100": "M","001101": "N",
    "011110": "e","011111": "f","001001": "J","001000": "I",
    "011011": "b","011010": "a","000110": "G","000111": "H",
    "000011": "D","000010": "C","100100": "k","100101": "l",
    "111100": "8","111101": "9","100010": "i","100011": "j",
    "101110": "u","101111": "v","111001": "5","111000": "4",
    "101011": "r","101010": "q","110011": "z","110010": "y",
    "010010": "S","010011": "T","010111": "X","010110": "W",
    "110110": "2","110111": "3","011000": "Y","011001": "Z",
    "001111": "P","001110": "O","011101": "d","011100": "c",
    "001010": "K","001011": "L","101101": "t","000000": "A",
    "000001": "B","100111": "n","100110": "m","000101": "F",
    "000100": "E","111111": "/","111110": "+","100001": "h",
    "100000": "g","010001": "R","010000": "Q","101100": "s",
    "111010": "6","111011": "7","101000": "o","101001": "p"
}

decode_table = {
    'w': '110000', 'x': '110001', '1': '110101', '0': '110100',
    'U': '010100', 'V': '010101', 'M': '001100', 'N': '001101',
    'e': '011110', 'f': '011111', 'J': '001001', 'I': '001000',
    'b': '011011', 'a': '011010', 'G': '000110', 'H': '000111',
    'D': '000011', 'C': '000010', 'k': '100100', 'l': '100101',
    '8': '111100', '9': '111101', 'i': '100010', 'j': '100011',
    'u': '101110', 'v': '101111', '5': '111001', '4': '111000',
    'r': '101011', 'q': '101010', 'z': '110011', 'y': '110010',
    'S': '010010', 'T': '010011', 'X': '010111', 'W': '010110',
    '2': '110110', '3': '110111', 'Y': '011000', 'Z': '011001',
    'P': '001111', 'O': '001110', 'd': '011101', 'c': '011100',
    'K': '001010', 'L': '001011', 't': '101101', 'A': '000000',
    'B': '000001', 'n': '100111', 'm': '100110', 'F': '000101',
    'E': '000100', '/': '111111', '+': '111110', 'h': '100001',
    'g': '100000', 'R': '010001', 'Q': '010000', 's': '101100',
    '6': '111010', '7': '111011', 'o': '101000', 'p': '101001'
}

# Format checker
def is_invalid_format(argv):
    if len(argv) != 3:
        return True
    elif argv[1] == "-h":
        return True
    elif argv[1] == "" or  (argv[1] != "-d" and argv[1] != "-e" and argv[1] != "-b"):
        return True
    elif argv[2] == "":
        return True
    else:
        return False

# ASCII character -> 8-digit binary
def chr2bin(letter):
    binary = ""
    cardinals = [128, 64, 32, 16, 8, 4, 2, 1]
    ascii_num = ord(letter)

    for car in cardinals:
        if ascii_num >= car:
            ascii_num -= car
            binary += "1"
        else:
            binary += "0"

    return binary

# 8-digit binary -> ASCII character
def bin2chr(binary):
    cardinal = 128
    decimal = 0

    for digit in binary:
        if int(digit) == 1:
            decimal += cardinal
        cardinal //= 2

    return chr(decimal)

# base64 encoder
def encode(data):
    binary = ""
    six_separated_binary = []
    base64 = ""

    # ASCII -> binary
    for letter in data:
        binary += chr2bin(letter)

    # Zero padding
    if len(binary) % 6 != 0:
        binary += "0" * (6 - (len(binary) % 6))

    # Separation into 6-digit binary
    for idx in range(0, len(binary), 6):
        six_separated_binary.append(binary[idx:idx+6])

    # 6-digit binary -> base64 character
    for six_digit in six_separated_binary:
        base64 += encode_table[six_digit]

    # = padding
    if len(base64) % 4 != 0:
        base64 += "=" * (4 - (len(base64) % 4))

    print("[Encode]", base64)

# base64 decoder
def decode(base64):
    data = ""
    binary = ""

    # Remove = padding
    base64 = base64.replace("=","")

    # base64 character -> 6-digit binary
    for base64char in base64:
        binary += decode_table[base64char]

    # Remove zero padding
    char_len = len(binary) // 8
    if len(binary) % 8 != 0:
        binary = binary[0:char_len*8]

    start = 0
    end   = 8
    while end <= len(binary):
        data += bin2chr(binary[start:end])
        start += 8
        end += 8

    print("[Decode]", data)

def main():
    if is_invalid_format(sys.argv):
        print("[*] Usage")
        print("python3 base64tool.py [-d decode | -e encode | -b both] \"data\"")
        sys.exit(0)

    print("[Origin]", sys.argv[2])

    if sys.argv[1] == "-e":
        encode(sys.argv[2])
    elif sys.argv[1] == "-d":
        decode(sys.argv[2])
    else:
        encode(sys.argv[2])
        decode(sys.argv[2])

if __name__ == '__main__':
    main()

基本的には,上記の手順を1つ1つ実行しているだけです.
シフト演算等は用いず,なるべく分かりやすい形でコーディングしました.

ASCIIとか進数変換のあたりで若干詰まってしまいましたが,何とかなりました.(組み込み関数はないようですね)

テスト

# python base64tool.py 
[*] Usage
python3 base64tool.py [-d decode | -e encode | -b both] "data"

# python base64tool.py -e "AB"
[Origin] AB
[Encode] QUI=

# python base64tool.py -d "QUI="
[Origin] QUI=
[Decode] AB

# python base64tool.py -e "This is python program."
[Origin] This is python program.
[Encode] VGhpcyBpcyBweXRob24gcHJvZ3JhbS4=

# python base64tool.py -d "VGhpcyBpcyBweXRob24gcHJvZ3JhbS4="
[Origin] VGhpcyBpcyBweXRob24gcHJvZ3JhbS4=
[Decode] This is python program.

動きました\(^o^)/
いつも使ってるものでも,敢えてイチから作り直してみると楽しいですね!

残念ながらマルチバイト文字には対応していないので,今後の課題としたいと思います.