Constants - unihan_etl.constants

Constants for unihan_etl.

unihan_etl.constants.UNIHAN_MANIFEST = {'Unihan_DictionaryIndices.txt': ('kCheungBauerIndex', 'kCowles', 'kDaeJaweon', 'kFennIndex', 'kGSR', 'kHanYu', 'kIRGDaeJaweon', 'kIRGHanyuDaZidian', 'kIRGKangXi', 'kKangXi', 'kKarlgren', 'kLau', 'kMatthews', 'kMeyerWempe', 'kMorohashi', 'kNelson', 'kSBGY', 'kSMSZD2003Index'), 'Unihan_DictionaryLikeData.txt': ('kAlternateTotalStrokes', 'kCangjie', 'kCheungBauer', 'kCihaiT', 'kFenn', 'kFourCornerCode', 'kFrequency', 'kGradeLevel', 'kHDZRadBreak', 'kHKGlyph', 'kMojiJoho', 'kPhonetic', 'kStrange', 'kTotalStrokes', 'kUnihanCore2020'), 'Unihan_IRGSources.txt': ('kCompatibilityVariant', 'kIICore', 'kIRG_GSource', 'kIRG_HSource', 'kIRG_JSource', 'kIRG_KPSource', 'kIRG_KSource', 'kIRG_MSource', 'kIRG_SSource', 'kIRG_TSource', 'kIRG_USource', 'kIRG_UKSource', 'kIRG_VSource'), 'Unihan_NumericValues.txt': ('kAccountingNumeric', 'kOtherNumeric', 'kPrimaryNumeric', 'kVietnameseNumeric', 'kZhuangNumeric'), 'Unihan_OtherMappings.txt': ('kBigFive', 'kCCCII', 'kCNS1986', 'kCNS1992', 'kEACC', 'kGB0', 'kGB1', 'kGB3', 'kGB5', 'kGB7', 'kGB8', 'kIBMJapan', 'kJa', 'kJinmeiyoKanji', 'kJis0', 'kJis1', 'kJIS0213', 'kJoyoKanji', 'kKoreanEducationHanja', 'kKoreanName', 'kMainlandTelegraph', 'kPseudoGB1', 'kTaiwanTelegraph', 'kTGH', 'kXerox'), 'Unihan_RadicalStrokeCounts.txt': ('kRSAdobe_Japan1_6', 'kRSUnicode'), 'Unihan_Readings.txt': ('kCantonese', 'kDefinition', 'kHangul', 'kHanyuPinlu', 'kHanyuPinyin', 'kJapanese', 'kJapaneseKun', 'kJapaneseOn', 'kKorean', 'kMandarin', 'kSMSZD2003Readings', 'kTang', 'kTGHZ2013', 'kVietnamese', 'kXHC1983'), 'Unihan_Variants.txt': ('kSemanticVariant', 'kSimplifiedVariant', 'kSpecializedSemanticVariant', 'kSpoofingVariant', 'kTraditionalVariant', 'kZVariant')}

Dictionary mapping each Unihan file name to the tuple of fields that file contains.

unihan_etl.constants.CUSTOM_DELIMITED_FIELDS = ('kDefinition', 'kDaeJaweon', 'kHDZRadBreak', 'kIRG_GSource', 'kIRG_HSource', 'kIRG_JSource', 'kIRG_KPSource', 'kIRG_KSource', 'kIRG_MSource', 'kIRG_SSource', 'kIRG_TSource', 'kIRG_USource', 'kIRG_UKSource', 'kIRG_VSource')

Fields whose multiple values are separated by custom delimiters.

unihan_etl.constants.SPACE_DELIMITED_DICT_FIELDS = ('kAlternateTotalStrokes', 'kHanYu', 'kMandarin', 'kTGHZ2013', 'kSMSZD2003Index', 'kSMSZD2003Readings', 'kStrange', 'kTotalStrokes', 'kXHC1983')

Fields whose multiple values UNIHAN delimits with spaces; expanded into a dict.

unihan_etl.constants.SPACE_DELIMITED_LIST_FIELDS = ('kAccountingNumeric', 'kCantonese', 'kCCCII', 'kCheungBauer', 'kCheungBauerIndex', 'kCihaiT', 'kCowles', 'kFenn', 'kFennIndex', 'kFourCornerCode', 'kGSR', 'kHangul', 'kHanyuPinlu', 'kHanyuPinyin', 'kHKGlyph', 'kIBMJapan', 'kIICore', 'kIRGDaeJaweon', 'kIRGHanyuDaZidian', 'kIRGKangXi', 'kJa', 'kJapanese', 'kJapaneseKun', 'kJapaneseOn', 'kJinmeiyoKanji', 'kJis0', 'kJIS0213', 'kJis1', 'kJoyoKanji', 'kKangXi', 'kKarlgren', 'kKorean', 'kKoreanEducationHanja', 'kKoreanName', 'kLau', 'kMainlandTelegraph', 'kMatthews', 'kMeyerWempe', 'kMorohashi', 'kNelson', 'kOtherNumeric', 'kPhonetic', 'kPrimaryNumeric', 'kRSAdobe_Japan1_6', 'kRSUnicode', 'kSBGY', 'kSemanticVariant', 'kSimplifiedVariant', 'kSpecializedSemanticVariant', 'kSpoofingVariant', 'kTaiwanTelegraph', 'kTang', 'kTGH', 'kTraditionalVariant', 'kVietnamese', 'kVietnameseNumeric', 'kXerox', 'kZhuangNumeric', 'kZVariant')

Fields whose multiple values UNIHAN delimits with spaces; expanded into a list.

unihan_etl.constants.SPACE_DELIMITED_FIELDS = ('kAccountingNumeric', 'kCantonese', 'kCCCII', 'kCheungBauer', 'kCheungBauerIndex', 'kCihaiT', 'kCowles', 'kFenn', 'kFennIndex', 'kFourCornerCode', 'kGSR', 'kHangul', 'kHanyuPinlu', 'kHanyuPinyin', 'kHKGlyph', 'kIBMJapan', 'kIICore', 'kIRGDaeJaweon', 'kIRGHanyuDaZidian', 'kIRGKangXi', 'kJa', 'kJapanese', 'kJapaneseKun', 'kJapaneseOn', 'kJinmeiyoKanji', 'kJis0', 'kJIS0213', 'kJis1', 'kJoyoKanji', 'kKangXi', 'kKarlgren', 'kKorean', 'kKoreanEducationHanja', 'kKoreanName', 'kLau', 'kMainlandTelegraph', 'kMatthews', 'kMeyerWempe', 'kMorohashi', 'kNelson', 'kOtherNumeric', 'kPhonetic', 'kPrimaryNumeric', 'kRSAdobe_Japan1_6', 'kRSUnicode', 'kSBGY', 'kSemanticVariant', 'kSimplifiedVariant', 'kSpecializedSemanticVariant', 'kSpoofingVariant', 'kTaiwanTelegraph', 'kTang', 'kTGH', 'kTraditionalVariant', 'kVietnamese', 'kVietnameseNumeric', 'kXerox', 'kZhuangNumeric', 'kZVariant', 'kAlternateTotalStrokes', 'kHanYu', 'kMandarin', 'kTGHZ2013', 'kSMSZD2003Index', 'kSMSZD2003Readings', 'kStrange', 'kTotalStrokes', 'kXHC1983')

All space-delimited fields, regardless of their expanded form (list or dict).

unihan_etl.constants.INDEX_FIELDS: Tuple[str, ...] = ('ucn', 'char')

Default index fields for Unihan CSVs. You probably want these.

unihan_etl.constants.WORK_DIR = PosixPath('/home/runner/.cache/unihan_etl/downloads')

Directory to use for processing intermediate files.

unihan_etl.constants.UNIHAN_FILES = ['Unihan_DictionaryIndices.txt', 'Unihan_DictionaryLikeData.txt', 'Unihan_IRGSources.txt', 'Unihan_NumericValues.txt', 'Unihan_OtherMappings.txt', 'Unihan_RadicalStrokeCounts.txt', 'Unihan_Readings.txt', 'Unihan_Variants.txt']

Default Unihan Files

unihan_etl.constants.UNIHAN_URL = 'http://www.unicode.org/Public/UNIDATA/Unihan.zip'

URI of Unihan.zip data.

unihan_etl.constants.DESTINATION_DIR = PosixPath('/home/runner/.local/share/unihan_etl')

Directory to write the built CSV file to.

unihan_etl.constants.UNIHAN_ZIP_PATH = PosixPath('/home/runner/.cache/unihan_etl/downloads/Unihan.zip')

File path the downloaded Unihan.zip file is saved to.

unihan_etl.constants.UNIHAN_FIELDS: Tuple[str, ...] = ('kAccountingNumeric', 'kAlternateTotalStrokes', 'kBigFive', 'kCCCII', 'kCNS1986', 'kCNS1992', 'kCangjie', 'kCantonese', 'kCheungBauer', 'kCheungBauerIndex', 'kCihaiT', 'kCompatibilityVariant', 'kCowles', 'kDaeJaweon', 'kDefinition', 'kEACC', 'kFenn', 'kFennIndex', 'kFourCornerCode', 'kFrequency', 'kGB0', 'kGB1', 'kGB3', 'kGB5', 'kGB7', 'kGB8', 'kGSR', 'kGradeLevel', 'kHDZRadBreak', 'kHKGlyph', 'kHanYu', 'kHangul', 'kHanyuPinlu', 'kHanyuPinyin', 'kIBMJapan', 'kIICore', 'kIRGDaeJaweon', 'kIRGHanyuDaZidian', 'kIRGKangXi', 'kIRG_GSource', 'kIRG_HSource', 'kIRG_JSource', 'kIRG_KPSource', 'kIRG_KSource', 'kIRG_MSource', 'kIRG_SSource', 'kIRG_TSource', 'kIRG_UKSource', 'kIRG_USource', 'kIRG_VSource', 'kJIS0213', 'kJa', 'kJapanese', 'kJapaneseKun', 'kJapaneseOn', 'kJinmeiyoKanji', 'kJis0', 'kJis1', 'kJoyoKanji', 'kKangXi', 'kKarlgren', 'kKorean', 'kKoreanEducationHanja', 'kKoreanName', 'kLau', 'kMainlandTelegraph', 'kMandarin', 'kMatthews', 'kMeyerWempe', 'kMojiJoho', 'kMorohashi', 'kNelson', 'kOtherNumeric', 'kPhonetic', 'kPrimaryNumeric', 'kPseudoGB1', 'kRSAdobe_Japan1_6', 'kRSUnicode', 'kSBGY', 'kSMSZD2003Index', 'kSMSZD2003Readings', 'kSemanticVariant', 'kSimplifiedVariant', 'kSpecializedSemanticVariant', 'kSpoofingVariant', 'kStrange', 'kTGH', 'kTGHZ2013', 'kTaiwanTelegraph', 'kTang', 'kTotalStrokes', 'kTraditionalVariant', 'kUnihanCore2020', 'kVietnamese', 'kVietnameseNumeric', 'kXHC1983', 'kXerox', 'kZVariant', 'kZhuangNumeric')

Default Unihan fields

unihan_etl.constants.ALLOWED_EXPORT_TYPES = ['json', 'csv', 'yaml']

Allowed export types