chardetect.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112
  1. """
  2. Script which takes one or more file paths and reports on their detected
  3. encodings
  4. Example::
  5. % chardetect somefile someotherfile
  6. somefile: windows-1252 with confidence 0.5
  7. someotherfile: ascii with confidence 1.0
  8. If no paths are provided, it takes its input from stdin.
  9. """
  10. import argparse
  11. import sys
  12. from typing import Iterable, List, Optional
  13. from .. import __version__
  14. from ..universaldetector import UniversalDetector
  def description_of(
      lines: Iterable[bytes],
      name: str = "stdin",
      minimal: bool = False,
      should_rename_legacy: bool = False,
  ) -> Optional[str]:
      """
      Describe the probable encoding of a file or other iterable of bytes.

      Each item of ``lines`` is fed to a ``UniversalDetector`` until the
      input is exhausted or the detector reports that it is done; the
      detector is then closed and its result is turned into the return value.

      :param lines: The chunks of data to get the encoding of.
      :type lines: Iterable of bytes
      :param name: Name of the file or collection of lines; used as the
                   prefix of the human-readable report.
      :type name: str
      :param minimal: If true, return only the value stored under the
                      detector result's ``"encoding"`` key instead of a
                      full report line.
      :type minimal: ``bool``
      :param should_rename_legacy: Should we rename legacy encodings to
                                   their more modern equivalents?
      :type should_rename_legacy: ``bool``
      :returns: With ``minimal``, the detected encoding as-is (it can be
                falsy when nothing was detected, hence ``Optional``).
                Otherwise ``"<name>: <encoding> with confidence <value>"``
                or ``"<name>: no result"``.
      """
      detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
      for chunk in lines:
          detector.feed(bytearray(chunk))
          # Stop reading as soon as the detector has made up its mind -
          # particularly useful when the input starts with a BOM.
          if detector.done:
              break
      detector.close()

      outcome = detector.result
      encoding = outcome["encoding"]
      if minimal:
          return encoding
      if not encoding:
          return f"{name}: no result"
      confidence = outcome["confidence"]
      return f"{name}: {encoding} with confidence {confidence}"
  def main(argv: Optional[List[str]] = None) -> None:
      """
      Handles command line arguments and gets things started.

      Parses the arguments, then prints one ``description_of`` report per
      input to standard output. When an input is a terminal, a short usage
      hint is written to standard error first.

      :param argv: List of arguments, as if specified on the command-line.
                   If None, ``sys.argv[1:]`` is used instead.
      :type argv: list of str
      """
      # Get command line arguments
      parser = argparse.ArgumentParser(
          description=(
              "Takes one or more file paths and reports their detected encodings"
          )
      )
      parser.add_argument(
          "input",
          help="File whose encoding we would like to determine. (default: stdin)",
          type=argparse.FileType("rb"),
          nargs="*",
          default=[sys.stdin.buffer],
      )
      parser.add_argument(
          "--minimal",
          help="Print only the encoding to standard output",
          action="store_true",
      )
      parser.add_argument(
          "-l",
          "--legacy",
          help="Rename legacy encodings to more modern ones.",
          action="store_true",
      )
      parser.add_argument(
          "--version", action="version", version=f"%(prog)s {__version__}"
      )
      args = parser.parse_args(argv)

      try:
          for f in args.input:
              if f.isatty():
                  print(
                      "You are running chardetect interactively. Press "
                      "CTRL-D twice at the start of a blank line to signal the "
                      "end of your input. If you want help, run chardetect "
                      "--help\n",
                      file=sys.stderr,
                  )
              print(
                  description_of(
                      f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy
                  )
              )
      finally:
          # argparse.FileType opens every named file while parsing and never
          # closes it, so release the handles here - even if a report fails
          # part-way through. The process's own stdin is left open for the
          # caller.
          for f in args.input:
              if f is not sys.stdin.buffer:
                  f.close()
  # Run the command-line interface only when this file is executed as a
  # script, not when it is imported as a module.
  if __name__ == "__main__":
      main()