8.文本文件编码检测与转换。
"""Detect the character encoding of text files and optionally convert them.

Every path given on the command line is fed to chardet's
``UniversalDetector`` and the detected encoding and confidence are printed.
When ``-e/--encoding`` and/or ``-o/--output`` is supplied, each file whose
encoding was detected with a confidence above ``MIN_CONFIDENCE`` is
re-encoded to the target encoding (UTF-8 by default).  Converted files are
written into the output directory when one is given; otherwise the source
file is overwritten in place.
"""
import argparse
import codecs
import os
import sys

from chardet.universaldetector import UniversalDetector

# A file is converted only when detection confidence is strictly above this.
MIN_CONFIDENCE = 0.6

# Help text for -e/--encoding (runtime string, shown by --help).
ENCODING_HELP = '''
方针编码。支持的编码有:
ASCII, (Default) UTF-8 (with or without a BOM), UTF-16 (with a BOM),
UTF-32 (with a BOM), Big5, GB2312/GB18030, EUC-TW, HZ-GB-2312, ISO-2022-CN, EUC-JP, SHIFT_JIS, ISO-2022-JP,
ISO-2022-KR, KOI8-R, MacCyrillic, IBM855, IBM866, ISO-8859-5, windows-1251, ISO-8859-2, windows-1250, EUC-KR,
ISO-8859-5, windows-1251, ISO-8859-1, windows-1252, ISO-8859-7, windows-1253, ISO-8859-8, windows-1255, TIS-620
'''


def parse_args(argv=None):
    """Parse and validate the command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with ``filePaths``, ``encoding`` and
        ``output``.  ``encoding`` is set to ``'UTF-8'`` when only an output
        directory was given.

    Exits:
        With status 1 if the output directory does not exist, and with
        status 2 (via ``parser.error``) if the target encoding is unknown
        to Python.
    """
    parser = argparse.ArgumentParser(description='文本文件编码检测与转换')
    parser.add_argument('filePaths', nargs='+',
                        help='检测或转换的文件路径')
    parser.add_argument('-e', '--encoding', nargs='?', const='UTF-8',
                        help=ENCODING_HELP)
    parser.add_argument('-o', '--output',
                        help='输出目次')
    args = parser.parse_args(argv)

    # Giving an output directory is enough to switch conversion on; the
    # target encoding then defaults to UTF-8.
    if args.output is not None:
        if not args.encoding:
            args.encoding = 'UTF-8'
        if not os.path.isdir(args.output):
            print('Invalid Directory: ' + args.output)
            sys.exit(1)

    # Reject an unknown target encoding before any file is opened for
    # writing: open(..., 'w', encoding=<bad>) truncates the file first and
    # only then fails, which would destroy the source on in-place conversion.
    if args.encoding:
        try:
            codecs.lookup(args.encoding)
        except LookupError:
            parser.error('unknown encoding: ' + args.encoding)

    return args


def detect_encoding(detector, file_path):
    """Run *detector* over *file_path* and return ``(encoding, confidence)``.

    The file is read in binary mode line by line and reading stops as soon
    as the detector reports it is done.  ``encoding`` is ``None`` when the
    detector could not decide.
    """
    detector.reset()
    with open(file_path, 'rb') as source:
        for chunk in source:
            detector.feed(chunk)
            if detector.done:
                break
    detector.close()
    result = detector.result
    return result['encoding'], result['confidence']


def convert_file(source_path, source_encoding, target_path, target_encoding):
    """Re-encode *source_path* into *target_path*.

    Undecodable or unencodable characters are replaced rather than raising.
    ``newline=''`` on both sides keeps the original line endings untouched,
    so only the encoding changes.  The source is read completely and closed
    before the target is opened, which makes ``target_path == source_path``
    (in-place conversion) safe.
    """
    with open(source_path, 'r', encoding=source_encoding,
              errors='replace', newline='') as source:
        text = source.read()
    with open(target_path, 'w', encoding=target_encoding,
              errors='replace', newline='') as target:
        target.write(text)


def main(argv=None):
    """Report the encoding of every input file and convert when requested."""
    args = parse_args(argv)
    detector = UniversalDetector()

    print()
    print('Encoding (Confidence)', ':', 'File path')
    for file_path in args.filePaths:
        # Skip anything that is not an existing regular file.
        if not os.path.isfile(file_path):
            print('Invalid Path: ' + file_path)
            continue

        encoding, confidence = detect_encoding(detector, file_path)
        if encoding is None:
            encoding = 'Unknown'
            # NOTE(review): placeholder value shown for undetected files;
            # such files are never converted regardless of this number.
            confidence = 0.99
        print('{} {:>12} : {}'.format(encoding.rjust(8),
                                      '(' + str(confidence * 100) + '%)',
                                      file_path))

        convertible = (args.encoding
                       and encoding != 'Unknown'
                       and confidence > MIN_CONFIDENCE)
        if not convertible:
            continue

        # chardet can report encodings Python ships no codec for; skip the
        # file instead of aborting the whole run.
        try:
            codecs.lookup(encoding)
        except LookupError:
            print('Unsupported Encoding: ' + encoding + ' : ' + file_path)
            continue

        # Without an output directory the source file is overwritten.
        if args.output:
            target_path = os.path.join(args.output,
                                       os.path.basename(file_path))
        else:
            target_path = file_path
        convert_file(file_path, encoding, target_path, args.encoding)


if __name__ == '__main__':
    main()
最后两个脚本内容选自实验楼的课程《使用 Python3 编写系列实用脚本》,课程对这两个脚本有具体的实现过程讲解,感兴趣的同学可以直接前往实验楼进行学习!
(编辑:湖南网)
【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!
|