首页手记 python 获取 CASIA 脱机和在线手写汉字库（二）

python 获取 CASIA 脱机和在线手写汉字库（二）

标签：

Python

本篇文章我将「CASIA 脱机和在线手写汉字库」的特征数据汇总写入一个字典之中，以方便以后的使用。

import os
import sys
import zipfile, rarfile
import struct, pickle
import pandas as pd
import numpy as np
import tables as tb
import time


def getZ(filename):
    """Open a ``.rar`` or ``.zip`` archive and return the archive object.

    Parameters
    ----------
    filename : str
        Path of the archive. The archive type is chosen from the file
        extension, compared case-insensitively.

    Returns
    -------
    rarfile.RarFile or zipfile.ZipFile
        The opened archive. The caller is responsible for closing it.

    Raises
    ------
    ValueError
        If the extension is neither ``.rar`` nor ``.zip``.
    """
    end = os.path.splitext(filename)[1].lower()
    if end == '.rar':
        return rarfile.RarFile(filename)
    if end == '.zip':
        return zipfile.ZipFile(filename)
    # The previous version fell through to ``return Z`` with ``Z`` unbound,
    # which surfaced as a confusing UnboundLocalError.
    raise ValueError(f'unsupported archive type {end!r}: {filename!r}')


class Bunch(dict):
    """A ``dict`` whose items are also reachable as attributes.

    ``b.key`` and ``b['key']`` refer to the same storage because the
    instance ``__dict__`` is the dictionary itself.
    """

    def __init__(self, *args, **kwds):
        super().__init__(*args, **kwds)
        # Alias the attribute namespace to the mapping itself.
        self.__dict__ = self

    def __getstate__(self):
        # The items already travel as ordinary dict items; return a
        # non-None state only so that __setstate__ runs on restore.
        return {}

    def __setstate__(self, state):
        # pickle/copy rebuild the object without calling __init__, so the
        # attribute/item alias has to be re-established here; otherwise
        # attribute and item access silently diverge on the restored object.
        self.__dict__ = self

        
class MPF(Bunch):
    """Reader for an ``.mpf`` feature file opened in binary mode.

    The constructor consumes the header from ``fp`` and stores its fields
    (``code_format``, ``text``, ``code_type``, ``code_length``,
    ``data_type``, ``nrows``, ``ndims``) on the instance. Iterating the
    instance then reads the records that follow the header.
    """

    def __init__(self, fp, *args, **kwds):
        """Parse the header from ``fp``.

        Parameters
        ----------
        fp : binary file object
            Positioned at the start of the ``.mpf`` data; it is kept on the
            instance and read further by ``__iter__``.
        """
        super().__init__(*args, **kwds)
        self.fp = fp
        # Explicit little-endian, fixed-size codes ('<i' = 4 bytes,
        # '<h' = 2 bytes). The native 'l' code is 8 bytes on 64-bit
        # Linux/macOS, so unpacking a 4-byte read with it fails there.
        header_size = struct.unpack('<i', self.fp.read(4))[0]
        self.code_format = self.fp.read(8).decode('ascii').rstrip('\x00')
        # 62 is the combined size of the fixed-width header fields read
        # here (4 + 8 + 20 + 2 + 20 + 4 + 4); the rest is free text.
        self.text = self.fp.read(header_size - 62).decode().rstrip('\x00')
        self.code_type = self.fp.read(20).decode('latin-1').rstrip('\x00')
        self.code_length = struct.unpack('<h', self.fp.read(2))[0]
        self.data_type = self.fp.read(20).decode('ascii').rstrip('\x00')
        self.nrows = struct.unpack('<i', self.fp.read(4))[0]
        self.ndims = struct.unpack('<i', self.fp.read(4))[0]

    def __iter__(self):
        """Yield ``(data, label)`` for each of the ``nrows`` records.

        ``label`` is ``code_length`` bytes decoded as GB18030 and ``data``
        is a ``uint8`` array built from the next ``ndims`` bytes.
        """
        for _ in range(self.nrows):
            label = self.fp.read(self.code_length).decode('gb18030')
            data = np.frombuffer(self.fp.read(self.ndims), np.uint8)
            yield data, label

            
class Writer(Bunch):
    """Holds the description text and feature table read from one MPF."""

    def __init__(self, mpf, *args, **kwds):
        """Collect every record of ``mpf`` into a structured array.

        ``self.feature`` is a NumPy structured array with the fields
        ``label`` (unicode, length 2) and ``feature`` (``uint8`` vector).
        It can be converted to pandas with
        ``pd.DataFrame.from_dict(dict(self.feature))``.

        Parameters
        ----------
        mpf : iterable of ``(feature, label)`` pairs with a ``text`` attribute
            Typically an ``MPF`` instance.
        """
        super().__init__(*args, **kwds)
        self.text = mpf.text
        # Use the dimensionality announced by the file header instead of a
        # hard-coded width; fall back to 512 (the value previously hard-coded)
        # for objects that do not expose ``ndims``.
        ndims = getattr(mpf, 'ndims', 512)
        t = np.dtype([('label', 'U', 2), ('feature', np.uint8, (ndims,))])
        self.feature = np.array(
            [(label, feature) for feature, label in mpf], dtype=t)

        
class Feature(Bunch):
    """Maps ``writer<id>`` to a ``Writer`` for every ``.mpf`` in one archive.

    The instance stays empty unless ``set_name`` is a ``.zip`` file whose
    base name contains ``'HW'`` and no underscore.
    """

    def __init__(self, root, set_name, *args, **kwds):
        """Load the archive ``set_name`` located in directory ``root``.

        Parameters
        ----------
        root : str
            Directory containing the archive.
        set_name : str
            File name of the archive, including its extension.
        """
        super().__init__(*args, **kwds)
        filename, end = os.path.splitext(set_name)

        # Not one of the archives we are interested in: leave empty.
        if 'HW' not in filename or end != '.zip' or '_' in filename:
            return

        self.name = filename
        # os.path.join works whether or not ``root`` ends with a separator;
        # the ``with`` block closes the archive once it has been read.
        with getZ(os.path.join(root, set_name)) as Z:
            self._get_dataset(Z)

    def _get_dataset(self, Z):
        """Read every ``.mpf`` member of the open archive ``Z``."""
        for name in Z.namelist():
            if not name.endswith('.mpf'):
                continue
            # Key on the member's base name so members at the archive root
            # or in nested folders are handled as well.
            stem = os.path.splitext(name)[0].rsplit('/', 1)[-1]
            writer_ = f"writer{stem}"

            with Z.open(name) as fp:
                self[writer_] = Writer(MPF(fp))
                    

class XFeature(Bunch):
    """Maps a data-set name to the ``Feature`` built from its archive."""

    def __init__(self, root, *args, **kwds):
        """Scan directory ``root`` and load every matching ``.zip`` archive.

        An archive matches when its file name contains ``'HW'``, its
        extension is ``.zip`` and its base name has no underscore. The key
        is the base name with dots removed. The time taken per archive is
        printed.
        """
        super().__init__(*args, **kwds)
        for filename in os.listdir(root):
            set_name, end = os.path.splitext(filename)
            wanted = 'HW' in filename and end == '.zip' and '_' not in set_name
            if not wanted:
                continue
            setname = set_name.replace('.', '')
            start = time.time()
            self[setname] = Feature(root, filename)
            elapsed = time.time() - start
            print(f'{elapsed}秒，完成字典 {setname} 的创建！')
                    
                    
def bunch2json(bunch, path):
    """Serialize ``bunch`` to the file ``path`` using pickle.

    Despite the function name, the file written is a binary pickle, not
    JSON text.

    Parameters
    ----------
    bunch : object
        Any picklable object.
    path : str
        Destination file; it is created or overwritten.
    """
    with open(path, 'wb') as fp:
        # Use the newest protocol available: it is the most efficient one
        # and copes with very large objects such as big feature arrays.
        pickle.dump(bunch, fp, protocol=pickle.HIGHEST_PROTOCOL)
        
def json2bunch(path):
    """Load and return the object pickled in the file ``path``.

    Despite the function name, the file is read as a binary pickle, not
    JSON. Unpickling can execute arbitrary code, so only load files from
    a trusted source.
    """
    with open(path, 'rb') as stream:
        return pickle.load(stream)

将数据特征存储为结构数组

%%time
# Directory that is scanned for the CASIA .zip archives.
root = 'E:/OCR/CASIA/'

# Build one entry per matching archive found in root.
mpf = XFeature(root)

17.253000497817993秒，完成字典 HWDB10trn 的创建！
4.3190014362335205秒，完成字典 HWDB10tst 的创建！
12.178295135498047秒，完成字典 HWDB11trn 的创建！
2.823002338409424秒，完成字典 HWDB11tst 的创建！
17.44099521636963秒，完成字典 OLHWDB10trn 的创建！
4.106986999511719秒，完成字典 OLHWDB10tst 的创建！
15.28876519203186秒，完成字典 OLHWDB11trn 的创建！
3.2720530033111572秒，完成字典 OLHWDB11tst 的创建！
Wall time: 1min 16s

将上述的 8 个字典写入本地磁盘，以便以后使用。注意：文件名虽以 .json 结尾，但 bunch2json 实际使用 pickle 进行序列化，生成的是二进制文件而非 JSON 文本。

%%time 
# NOTE: despite the .json suffix, bunch2json writes a binary pickle file.
path = f'{root}mpf/feature.json'
bunch2json(mpf, path)

Wall time: 2min 9s

%%time 
path = f'{root}mpf/feature.json'

# Read the pickled dictionary back from disk.
xarray = json2bunch(path)

Wall time: 28.3 s

从本地载入字典只需要 28.3 秒，考虑到该文件的体积，这已经十分快速了：

# File size in bytes divided by 1e9, i.e. expressed in gigabytes.
size = os.path.getsize(f'{root}mpf/feature.json') / 1e9
print(f'特征字典的大小是 {size} GB !')

特征字典的大小是 2.793863326 GB !

# any() only tests whether some key is truthy, so comparing any(xarray) with
# any(mpf) is True for any two non-empty dicts. Compare the data-set names
# and, per data set, the stored keys instead.
xarray.keys() == mpf.keys() and all(xarray[k].keys() == mpf[k].keys() for k in mpf)

True

`feature` 字典的结构

整个 feature 字典就是一个树结构！

其中，HWDB10trn 等表示的是数据集的名字，而 feature 则表示 writer... 的特征数组，text 则是对该 feature 的简单描述。

如果想要再次使用已经封装好的 feature 数据，而不需要重新生成字典，我们需要导入 json2bunch, XFeature, Feature, Writer（我将其打包为 xhw.py 脚本）。之所以要一并导入这些类，是因为 pickle 在反序列化时必须能够找到它们的定义，例如：

import sys

# Make the directory that contains the base package importable.
sys.path.append('E:/xlab')
# The classes are imported alongside json2bunch so they are available when
# the pickled objects are loaded.
from base.xhw import json2bunch, XFeature, Feature, Writer

root = 'E:/OCR/CASIA/' 
path = f'{root}mpf/feature.json' 

feature = json2bunch(path)   # this is the feature-data dictionary we need