123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352 |
"""
Metadata about the languages used by the model training code for our
SingleByteCharSetProbers. It could be used for other things in the future.

This code is based on the language metadata from the uchardet project.
"""
- from string import ascii_letters
- from typing import List, Optional
# TODO: Add Ukrainian (KOI8-U)
class Language:
    """Describe a single language for the purposes of model training.

    :ivar name: English, human-readable name of the language.
    :type name: str
    :ivar iso_code: 2-letter ISO 639-1 code when one exists, otherwise a
                    3-letter ISO code, or a code from another catalog as a
                    last resort.
    :type iso_code: str
    :ivar use_ascii: Whether the ASCII letters are part of the alphabet used
                     for trained models.
    :type use_ascii: bool
    :ivar charsets: The charsets to support and generate data for.
    :type charsets: list of str
    :ivar alphabet: The characters of the language's alphabet, stored
                    de-duplicated and sorted by code point. When `use_ascii`
                    is `True` the ASCII letters are appended automatically,
                    so only the non-ASCII characters have to be supplied.
    :type alphabet: str
    :ivar wiki_start_pages: Wikipedia pages to start from when crawling
                            Wikipedia for training data.
    :type wiki_start_pages: list of str
    """

    def __init__(
        self,
        name: Optional[str] = None,
        iso_code: Optional[str] = None,
        use_ascii: bool = True,
        charsets: Optional[List[str]] = None,
        alphabet: Optional[str] = None,
        wiki_start_pages: Optional[List[str]] = None,
    ) -> None:
        """Store the metadata and normalise the alphabet.

        :raises ValueError: if `use_ascii` is false and no (or an empty)
                            `alphabet` is given.
        """
        super().__init__()

        # Without the ASCII letters there is nothing to fall back on, so an
        # explicit alphabet is mandatory.
        if not use_ascii and not alphabet:
            raise ValueError("Must supply alphabet if use_ascii is False")

        letters = alphabet or ""
        if use_ascii:
            letters += ascii_letters

        # NOTE: __repr__ lists the attributes in assignment order, so keep
        # the order of these assignments stable.
        self.name = name
        self.iso_code = iso_code
        self.use_ascii = use_ascii
        self.charsets = charsets
        # Collapse duplicates and sort by code point.
        self.alphabet = "".join(sorted(set(letters)))
        self.wiki_start_pages = wiki_start_pages

    def __repr__(self) -> str:
        """Return ``ClassName(attr=value, ...)`` covering every public attribute."""
        shown = [
            f"{attr}={value!r}"
            for attr, value in vars(self).items()
            if not attr.startswith("_")
        ]
        joined = ", ".join(shown)
        return f"{self.__class__.__name__}({joined})"
# Registry of every language we have metadata for, keyed by the language's
# English name (the same value that is passed as ``name``).
#
# Entries pass ``use_ascii=False`` when the ASCII letters are not all part of
# the language's alphabet (non-Latin scripts, or Latin alphabets that drop
# some letters); in that case the full alphabet is spelled out explicitly.
LANGUAGES = {
    "Arabic": Language(
        name="Arabic",
        iso_code="ar",
        use_ascii=False,
        # We only support encodings that use isolated
        # forms, because the current recommendation is
        # that the rendering system handles presentation
        # forms. This means we purposefully skip IBM864.
        # NOTE(review): "CP864" is Python's codec name for IBM864, so the
        # list below appears to contradict the sentence above -- confirm
        # whether CP864 is meant to be listed.
        charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"],
        alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
        wiki_start_pages=["الصفحة_الرئيسية"],
    ),
    "Belarusian": Language(
        name="Belarusian",
        iso_code="be",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"],
        alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
        wiki_start_pages=["Галоўная_старонка"],
    ),
    "Bulgarian": Language(
        name="Bulgarian",
        iso_code="bg",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"],
        alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
        wiki_start_pages=["Начална_страница"],
    ),
    "Czech": Language(
        name="Czech",
        # The ISO 639-1 language code for Czech is "cs"; "cz" is the
        # country code, not a language code.
        iso_code="cs",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ",
        wiki_start_pages=["Hlavní_strana"],
    ),
    "Danish": Language(
        name="Danish",
        iso_code="da",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="æøåÆØÅ",
        wiki_start_pages=["Forside"],
    ),
    "German": Language(
        name="German",
        iso_code="de",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="äöüßẞÄÖÜ",
        wiki_start_pages=["Wikipedia:Hauptseite"],
    ),
    "Greek": Language(
        name="Greek",
        iso_code="el",
        use_ascii=False,
        charsets=["ISO-8859-7", "WINDOWS-1253"],
        alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
        wiki_start_pages=["Πύλη:Κύρια"],
    ),
    "English": Language(
        name="English",
        iso_code="en",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
        wiki_start_pages=["Main_Page"],
    ),
    "Esperanto": Language(
        name="Esperanto",
        iso_code="eo",
        # Q, W, X, and Y not used at all
        use_ascii=False,
        charsets=["ISO-8859-3"],
        alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ",
        wiki_start_pages=["Vikipedio:Ĉefpaĝo"],
    ),
    "Spanish": Language(
        name="Spanish",
        iso_code="es",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
        wiki_start_pages=["Wikipedia:Portada"],
    ),
    "Estonian": Language(
        name="Estonian",
        iso_code="et",
        use_ascii=False,
        charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"],
        # C, F, Š, Q, W, X, Y, Z, Ž are only for
        # loanwords
        alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü",
        wiki_start_pages=["Esileht"],
    ),
    "Finnish": Language(
        name="Finnish",
        iso_code="fi",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÅÄÖŠŽåäöšž",
        wiki_start_pages=["Wikipedia:Etusivu"],
    ),
    "French": Language(
        name="French",
        iso_code="fr",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
        wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
    ),
    "Hebrew": Language(
        name="Hebrew",
        iso_code="he",
        use_ascii=False,
        charsets=["ISO-8859-8", "WINDOWS-1255"],
        alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
        wiki_start_pages=["עמוד_ראשי"],
    ),
    "Croatian": Language(
        name="Croatian",
        iso_code="hr",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stranica"],
    ),
    "Hungarian": Language(
        name="Hungarian",
        iso_code="hu",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ",
        wiki_start_pages=["Kezdőlap"],
    ),
    "Italian": Language(
        name="Italian",
        iso_code="it",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÀÈÉÌÒÓÙàèéìòóù",
        wiki_start_pages=["Pagina_principale"],
    ),
    "Lithuanian": Language(
        name="Lithuanian",
        iso_code="lt",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        # Q, W, and X not used at all
        alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž",
        wiki_start_pages=["Pagrindinis_puslapis"],
    ),
    "Latvian": Language(
        name="Latvian",
        iso_code="lv",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        # Q, W, X, Y are only for loanwords
        alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž",
        wiki_start_pages=["Sākumlapa"],
    ),
    "Macedonian": Language(
        name="Macedonian",
        iso_code="mk",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
        wiki_start_pages=["Главна_страница"],
    ),
    "Dutch": Language(
        name="Dutch",
        iso_code="nl",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
        wiki_start_pages=["Hoofdpagina"],
    ),
    "Polish": Language(
        name="Polish",
        iso_code="pl",
        # Q and X are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż",
        wiki_start_pages=["Wikipedia:Strona_główna"],
    ),
    "Portuguese": Language(
        name="Portuguese",
        iso_code="pt",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
        wiki_start_pages=["Wikipédia:Página_principal"],
    ),
    "Romanian": Language(
        name="Romanian",
        iso_code="ro",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="ăâîșțĂÂÎȘȚ",
        wiki_start_pages=["Pagina_principală"],
    ),
    "Russian": Language(
        name="Russian",
        iso_code="ru",
        use_ascii=False,
        charsets=[
            "ISO-8859-5",
            "WINDOWS-1251",
            "KOI8-R",
            "MacCyrillic",
            "IBM866",
            "IBM855",
        ],
        alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
        wiki_start_pages=["Заглавная_страница"],
    ),
    "Slovak": Language(
        name="Slovak",
        iso_code="sk",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ",
        wiki_start_pages=["Hlavná_stránka"],
    ),
    "Slovene": Language(
        name="Slovene",
        iso_code="sl",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stran"],
    ),
    # Serbian can be written in both Latin and Cyrillic, but there's no
    # simple way to get the Latin alphabet pages from Wikipedia through
    # the API, so for now we just support Cyrillic.
    "Serbian": Language(
        name="Serbian",
        iso_code="sr",
        # Only the Cyrillic alphabet is supported (see above), so the ASCII
        # letters must not be mixed in -- same as the other Cyrillic entries.
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
        wiki_start_pages=["Главна_страна"],
    ),
    "Thai": Language(
        name="Thai",
        iso_code="th",
        use_ascii=False,
        charsets=["ISO-8859-11", "TIS-620", "CP874"],
        alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
        wiki_start_pages=["หน้าหลัก"],
    ),
    "Turkish": Language(
        name="Turkish",
        iso_code="tr",
        # Q, W, and X are not used by Turkish
        use_ascii=False,
        charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"],
        alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ",
        wiki_start_pages=["Ana_Sayfa"],
    ),
    "Vietnamese": Language(
        name="Vietnamese",
        iso_code="vi",
        use_ascii=False,
        # Windows-1258 is the only common 8-bit
        # Vietnamese encoding supported by Python.
        # From Wikipedia:
        # For systems that lack support for Unicode,
        # dozens of 8-bit Vietnamese code pages are
        # available.[1] The most common are VISCII
        # (TCVN 5712:1993), VPS, and Windows-1258.[3]
        # Where ASCII is required, such as when
        # ensuring readability in plain text e-mail,
        # Vietnamese letters are often encoded
        # according to Vietnamese Quoted-Readable
        # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
        # though usage of either variable-width
        # scheme has declined dramatically following
        # the adoption of Unicode on the World Wide
        # Web.
        charsets=["WINDOWS-1258"],
        alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY",
        wiki_start_pages=["Chữ_Quốc_ngữ"],
    ),
}
|