Core - unihan_etl.core
#
Build Unihan into tabular / structured format and export it.
- unihan_etl.core.filter_manifest(files)[source]#
Return filtered
UNIHAN_MANIFEST
from list of file names.
- exception unihan_etl.core.FieldNotFound(field)[source]#
Bases:
Exception
- Parameters:
field (str) –
- Return type:
None
- exception unihan_etl.core.FileNotSupported(field)[source]#
Bases:
Exception
- Parameters:
field (str) –
- Return type:
None
- unihan_etl.core.get_parser()[source]#
Return
argparse.ArgumentParser
instance for CLI.
- Return type:
argparse.ArgumentParser
- Returns:
argparse.ArgumentParser
– argument parser for CLI use.
- unihan_etl.core.has_valid_zip(zip_path)[source]#
Return True if valid zip exists.
- Return type:
- Parameters:
zip_path (str or pathlib.Path) – absolute path to zip
- Returns:
True if valid zip exists at path
- Return type:
- unihan_etl.core.zip_has_files(files, zip_file)[source]#
Return True if zip has the files inside.
- Return type:
- Parameters:
zip_file (
zipfile.ZipFile
) –
- Returns:
True if files are inside of zipfile.ZipFile.namelist()
- Return type:
- unihan_etl.core.download(url, dest, urlretrieve_fn=<function urlretrieve>, reporthook=None, cache=True)[source]#
Download file at URL to a destination.
- Return type:
- Parameters:
url (str or pathlib.Path) – URL to download from.
dest (pathlib.Path) – file path where download is to be saved.
urlretrieve_fn (UrlRetrieveFn) – function to download file
reporthook (ReportHookFn, Optional) – Function to write progress bar to stdout buffer.
cache (bool) –
- Returns:
Destination where the file was downloaded to.
- Return type:
- unihan_etl.core.extract_zip(zip_path, dest_dir)[source]#
Extract zip file. Return
zipfile.ZipFile
instance.
- Return type:
zipfile.ZipFile
- Parameters:
zip_path (pathlib.Path) – filepath to extract.
dest_dir (pathlib.Path) – directory to extract to.
- Returns:
The extracted zip.
- Return type:
- unihan_etl.core.normalize(raw_data, fields)[source]#
Return normalized data from UNIHAN data files.
- unihan_etl.core.expand_delimiters(normalized_data)[source]#
Return expanded multi-value fields in UNIHAN.
- Return type:
ExpandedExport
- Parameters:
normalized_data (list of dict) – Expects data in list of hashes, per
core.normalize()
- Returns:
Items which have fields with delimiters and custom separation rules will be expanded, including multi-value fields not using both (so all fields stay consistent).
- Return type:
- unihan_etl.core.export_csv(data, destination, fields)[source]#
- Return type:
- Parameters:
data (UntypedNormalizedData) –
destination (StrPath) –
fields (ColumnData) –
- unihan_etl.core.export_json(data, destination)[source]#
- Return type:
- Parameters:
data (UntypedNormalizedData) –
destination (StrPath) –
- unihan_etl.core.export_yaml(data, destination)[source]#
- Return type:
- Parameters:
data (UntypedNormalizedData) –
destination (StrPath) –
- class unihan_etl.core.Packager(options=Options(source='http://www.unicode.org/Public/UNIDATA/Unihan.zip', destination=PosixPath('/home/runner/.local/share/unihan_etl/unihan.csv'), zip_path=PosixPath('/home/runner/.cache/unihan_etl/downloads/Unihan.zip'), work_dir=PosixPath('/home/runner/.cache/unihan_etl/downloads'), fields=('ucn', 'char', 'kAccountingNumeric', 'kBigFive', 'kCCCII', 'kCNS1986', 'kCNS1992', 'kCangjie', 'kCantonese', 'kCheungBauer', 'kCheungBauerIndex', 'kCihaiT', 'kCompatibilityVariant', 'kCowles', 'kDaeJaweon', 'kDefinition', 'kEACC', 'kFenn', 'kFennIndex', 'kFourCornerCode', 'kFrequency', 'kGB0', 'kGB1', 'kGB3', 'kGB5', 'kGB7', 'kGB8', 'kGSR', 'kGradeLevel', 'kHDZRadBreak', 'kHKGlyph', 'kHKSCS', 'kHanYu', 'kHangul', 'kHanyuPinlu', 'kHanyuPinyin', 'kIBMJapan', 'kIICore', 'kIRGDaeJaweon', 'kIRGDaiKanwaZiten', 'kIRGHanyuDaZidian', 'kIRGKangXi', 'kIRG_GSource', 'kIRG_HSource', 'kIRG_JSource', 'kIRG_KPSource', 'kIRG_KSource', 'kIRG_MSource', 'kIRG_TSource', 'kIRG_USource', 'kIRG_VSource', 'kJIS0213', 'kJa', 'kJapaneseKun', 'kJapaneseOn', 'kJinmeiyoKanji', 'kJis0', 'kJis1', 'kJoyoKanji', 'kKPS0', 'kKPS1', 'kKSC0', 'kKSC1', 'kKangXi', 'kKarlgren', 'kKorean', 'kKoreanEducationHanja', 'kKoreanName', 'kLau', 'kMainlandTelegraph', 'kMandarin', 'kMatthews', 'kMeyerWempe', 'kMorohashi', 'kNelson', 'kOtherNumeric', 'kPhonetic', 'kPrimaryNumeric', 'kPseudoGB1', 'kRSAdobe_Japan1_6', 'kRSJapanese', 'kRSKanWa', 'kRSKangXi', 'kRSKorean', 'kRSUnicode', 'kSBGY', 'kSemanticVariant', 'kSimplifiedVariant', 'kSpecializedSemanticVariant', 'kTGH', 'kTaiwanTelegraph', 'kTang', 'kTotalStrokes', 'kTraditionalVariant', 'kVietnamese', 'kXHC1983', 'kXerox', 'kZVariant'), format='csv', input_files=['Unihan_DictionaryIndices.txt', 'Unihan_DictionaryLikeData.txt', 'Unihan_IRGSources.txt', 'Unihan_NumericValues.txt', 'Unihan_OtherMappings.txt', 'Unihan_RadicalStrokeCounts.txt', 'Unihan_Readings.txt', 'Unihan_Variants.txt'], download=False, expand=True, prune_empty=True, cache=True, 
log_level='INFO'))[source]#
Bases:
object
Download and generate a tabular release of UNIHAN.
- __init__(options=Options(source='http://www.unicode.org/Public/UNIDATA/Unihan.zip', destination=PosixPath('/home/runner/.local/share/unihan_etl/unihan.csv'), zip_path=PosixPath('/home/runner/.cache/unihan_etl/downloads/Unihan.zip'), work_dir=PosixPath('/home/runner/.cache/unihan_etl/downloads'), fields=('ucn', 'char', 'kAccountingNumeric', 'kBigFive', 'kCCCII', 'kCNS1986', 'kCNS1992', 'kCangjie', 'kCantonese', 'kCheungBauer', 'kCheungBauerIndex', 'kCihaiT', 'kCompatibilityVariant', 'kCowles', 'kDaeJaweon', 'kDefinition', 'kEACC', 'kFenn', 'kFennIndex', 'kFourCornerCode', 'kFrequency', 'kGB0', 'kGB1', 'kGB3', 'kGB5', 'kGB7', 'kGB8', 'kGSR', 'kGradeLevel', 'kHDZRadBreak', 'kHKGlyph', 'kHKSCS', 'kHanYu', 'kHangul', 'kHanyuPinlu', 'kHanyuPinyin', 'kIBMJapan', 'kIICore', 'kIRGDaeJaweon', 'kIRGDaiKanwaZiten', 'kIRGHanyuDaZidian', 'kIRGKangXi', 'kIRG_GSource', 'kIRG_HSource', 'kIRG_JSource', 'kIRG_KPSource', 'kIRG_KSource', 'kIRG_MSource', 'kIRG_TSource', 'kIRG_USource', 'kIRG_VSource', 'kJIS0213', 'kJa', 'kJapaneseKun', 'kJapaneseOn', 'kJinmeiyoKanji', 'kJis0', 'kJis1', 'kJoyoKanji', 'kKPS0', 'kKPS1', 'kKSC0', 'kKSC1', 'kKangXi', 'kKarlgren', 'kKorean', 'kKoreanEducationHanja', 'kKoreanName', 'kLau', 'kMainlandTelegraph', 'kMandarin', 'kMatthews', 'kMeyerWempe', 'kMorohashi', 'kNelson', 'kOtherNumeric', 'kPhonetic', 'kPrimaryNumeric', 'kPseudoGB1', 'kRSAdobe_Japan1_6', 'kRSJapanese', 'kRSKanWa', 'kRSKangXi', 'kRSKorean', 'kRSUnicode', 'kSBGY', 'kSemanticVariant', 'kSimplifiedVariant', 'kSpecializedSemanticVariant', 'kTGH', 'kTaiwanTelegraph', 'kTang', 'kTotalStrokes', 'kTraditionalVariant', 'kVietnamese', 'kXHC1983', 'kXerox', 'kZVariant'), format='csv', input_files=['Unihan_DictionaryIndices.txt', 'Unihan_DictionaryLikeData.txt', 'Unihan_IRGSources.txt', 'Unihan_NumericValues.txt', 'Unihan_OtherMappings.txt', 'Unihan_RadicalStrokeCounts.txt', 'Unihan_Readings.txt', 'Unihan_Variants.txt'], download=False, expand=True, prune_empty=True, cache=True, 
log_level='INFO'))[source]#
- download(urlretrieve_fn=<function urlretrieve>)[source]#
Download raw UNIHAN data if it does not already exist.
- Return type:
- Parameters:
urlretrieve_fn (function) – function to download file