2014-06-05 12:25:56 +00:00
|
|
|
'\" t -*- coding: UTF-8 -*-
|
2004-11-03 13:51:07 +00:00
|
|
|
.\" Copyright (c) 1996 Eric S. Raymond <esr@thyrsus.com>
|
_exit.2, access.2, alarm.2, close.2, link.2, mkdir.2, mknod.2, open.2, read.2, rename.2, rmdir.2, s390_runtime_instr.2, symlink.2, unlink.2, write.2, remove.3, charsets.7: srcfix: Tidy copyright notices
Signed-off-by: Michael Kerrisk <mtk.manpages@gmail.com>
2013-03-16 08:39:00 +00:00
|
|
|
.\" and Copyright (c) Andries Brouwer <aeb@cwi.nl>
|
2004-11-03 13:51:07 +00:00
|
|
|
.\"
|
MB_CUR_MAX.3, MB_LEN_MAX.3, btowc.3, fgetwc.3, fgetws.3, fputwc.3, fputws.3, fwide.3, getwchar.3, iconv.3, iconv_close.3, iconv_open.3, iswalnum.3, iswalpha.3, iswblank.3, iswcntrl.3, iswctype.3, iswdigit.3, iswgraph.3, iswlower.3, iswprint.3, iswpunct.3, iswspace.3, iswupper.3, iswxdigit.3, mblen.3, mbrlen.3, mbrtowc.3, mbsinit.3, mbsnrtowcs.3, mbsrtowcs.3, mbstowcs.3, mbtowc.3, nl_langinfo.3, putwchar.3, stpncpy.3, strnlen.3, towctrans.3, towlower.3, towupper.3, ungetwc.3, wcpcpy.3, wcpncpy.3, wcrtomb.3, wcscasecmp.3, wcscat.3, wcschr.3, wcscmp.3, wcscpy.3, wcscspn.3, wcsdup.3, wcslen.3, wcsncasecmp.3, wcsncat.3, wcsncmp.3, wcsncpy.3, wcsnlen.3, wcsnrtombs.3, wcspbrk.3, wcsrchr.3, wcsrtombs.3, wcsspn.3, wcsstr.3, wcstok.3, wcstombs.3, wcswidth.3, wctob.3, wctomb.3, wctrans.3, wctype.3, wcwidth.3, wmemchr.3, wmemcmp.3, wmemcpy.3, wmemmove.3, wmemset.3, wprintf.3, console.4, console_codes.4, random.4, charsets.7: s/GPLv2+_ONEPARA_DOC/GPLv2+_DOC_ONEPARA/
Signed-off-by: Michael Kerrisk <mtk.manpages@gmail.com>
2013-03-10 09:29:44 +00:00
|
|
|
.\" %%%LICENSE_START(GPLv2+_DOC_ONEPARA)
|
2004-11-03 13:51:07 +00:00
|
|
|
.\" This is free documentation; you can redistribute it and/or
|
|
|
|
.\" modify it under the terms of the GNU General Public License as
|
|
|
|
.\" published by the Free Software Foundation; either version 2 of
|
|
|
|
.\" the License, or (at your option) any later version.
|
2013-03-10 09:28:56 +00:00
|
|
|
.\" %%%LICENSE_END
|
2004-11-03 13:51:07 +00:00
|
|
|
.\"
|
|
|
|
.\" This is combined from many sources, including notes by aeb and
|
|
|
|
.\" research by esr. Portions derive from a writeup by Roman Czyborra.
|
|
|
|
.\"
|
2014-06-05 09:00:14 +00:00
|
|
|
.\" Changes also by David Starner <dstarner98@aasaa.ofe.org>.
|
2014-05-28 09:42:47 +00:00
|
|
|
.\"
|
access.2, bdflush.2, bind.2, brk.2, chmod.2, chown.2, clone.2, epoll_wait.2, execve.2, fsync.2, getgroups.2, gethostname.2, getpagesize.2, getpriority.2, getrlimit.2, ioperm.2, kexec_load.2, link.2, mkdir.2, mmap.2, msgop.2, perf_event_open.2, process_vm_readv.2, ptrace.2, readlink.2, readv.2, recv.2, rename.2, sched_setaffinity.2, select.2, send.2, seteuid.2, signal.2, sigwaitinfo.2, stat.2, symlink.2, sync.2, sync_file_range.2, sysinfo.2, timer_create.2, uname.2, unlink.2, utime.2, wait.2, abs.3, atoi.3, catopen.3, cerf.3, cexp2.3, clearenv.3, clog2.3, ctime.3, des_crypt.3, ecvt.3, fgetgrent.3, flockfile.3, fseeko.3, ftime.3, ftok.3, ftw.3, getauxval.3, getcwd.3, getdtablesize.3, getgrent.3, getgrent_r.3, getgrnam.3, getgrouplist.3, getline.3, getpass.3, getutent.3, glob.3, insque.3, lseek64.3, memmem.3, mkstemp.3, mktemp.3, on_exit.3, openpty.3, putenv.3, qecvt.3, realpath.3, remove.3, setbuf.3, sigpause.3, strftime.3, strptime.3, strstr.3, strtod.3, tzset.3, updwtmp.3, xcrypt.3, core.5, utmp.5, capabilities.7, charsets.7, environ.7, ipv6.7, man-pages.7, packet.7, vdso.7: tstamp
Signed-off-by: Michael Kerrisk <mtk.manpages@gmail.com>
2014-08-19 16:55:41 +00:00
|
|
|
.TH CHARSETS 7 2014-08-19 "Linux" "Linux Programmer's Manual"
|
2004-11-03 13:51:07 +00:00
|
|
|
.SH NAME
|
2014-06-05 09:00:14 +00:00
|
|
|
charsets - character set standards and internationalization
|
2004-11-03 13:51:07 +00:00
|
|
|
.SH DESCRIPTION
|
2014-06-05 09:00:14 +00:00
|
|
|
This manual page gives an overview of different character set standards
|
|
|
|
and how they were used on Linux before Unicode became ubiquitous.
|
|
|
|
Some of this information is still helpful for people working with legacy
|
|
|
|
systems and documents.
|
|
|
|
.LP
|
|
|
|
Standards discussed include
|
|
|
|
ASCII, GB 2312, ISO 8859, JIS, KOI8-R, KS, and Unicode.
|
2004-11-03 13:51:07 +00:00
|
|
|
.LP
|
2014-06-05 09:00:14 +00:00
|
|
|
The primary emphasis is on character sets that were actually used by
|
|
|
|
locale character sets, not the myriad others that could be found in data
|
2004-11-03 13:51:07 +00:00
|
|
|
from other systems.
|
2007-06-15 19:55:07 +00:00
|
|
|
.SS ASCII
|
2004-11-03 13:51:07 +00:00
|
|
|
ASCII (American Standard Code For Information Interchange) is the original
|
2007-04-12 22:42:49 +00:00
|
|
|
7-bit character set, originally designed for American English.
|
2014-06-05 09:00:14 +00:00
|
|
|
Also known as US-ASCII.
|
|
|
|
It is currently described by the ISO 646:1991 IRV
|
|
|
|
(International Reference Version) standard.
|
2004-11-03 13:51:07 +00:00
|
|
|
.LP
|
|
|
|
Various ASCII variants replacing the dollar sign with other currency
|
2014-06-05 09:00:14 +00:00
|
|
|
symbols and replacing punctuation with non-English alphabetic
|
|
|
|
characters to cover German, French, Spanish, and others in 7 bits
|
|
|
|
emerged.
|
|
|
|
All are deprecated;
|
|
|
|
glibc does not support locales whose character sets are not true
|
|
|
|
supersets of ASCII.
|
2004-11-03 13:51:07 +00:00
|
|
|
.LP
|
2014-06-05 09:00:14 +00:00
|
|
|
As Unicode, when using UTF-8, is ASCII-compatible, plain ASCII text
|
|
|
|
still renders properly on modern systems using UTF-8.
|
2007-06-15 19:55:07 +00:00
|
|
|
.SS ISO 8859
|
2014-06-05 12:25:56 +00:00
|
|
|
ISO 8859 is a series of 15 8-bit character sets, all of which have ASCII
|
2014-06-05 09:00:14 +00:00
|
|
|
in their low (7-bit) half, invisible control characters in positions
|
|
|
|
128 to 159, and 96 fixed-width graphics in positions 160-255.
|
2004-11-03 13:51:07 +00:00
|
|
|
.LP
|
2014-06-05 09:00:14 +00:00
|
|
|
Of these, the most important is ISO 8859-1
|
|
|
|
("Latin Alphabet No. 1" / Latin-1).
|
|
|
|
It was widely adopted and supported by different systems,
|
|
|
|
and is gradually being replaced with Unicode.
|
|
|
|
The ISO 8859-1 characters are also the first 256 characters of Unicode.
|
2004-11-03 13:51:07 +00:00
|
|
|
.LP
|
|
|
|
Console support for the other 8859 character sets is available under
|
|
|
|
Linux through user-mode utilities (such as
|
|
|
|
.BR setfont (8))
|
|
|
|
that modify keyboard bindings and the EGA graphics
|
|
|
|
table and employ the "user mapping" font table in the console
|
|
|
|
driver.
|
|
|
|
.LP
|
|
|
|
Here are brief descriptions of each set:
|
|
|
|
.TP
|
|
|
|
8859-1 (Latin-1)
|
2014-06-05 09:00:14 +00:00
|
|
|
Latin-1 covers many West European languages such as Albanian, Basque,
|
2014-07-14 03:19:13 +00:00
|
|
|
Danish, English, Faroese, Galician, Icelandic, Irish, Italian,
|
2014-06-05 09:00:14 +00:00
|
|
|
Norwegian, Portuguese, Spanish, and Swedish.
|
|
|
|
The lack of the ligatures Dutch IJ/ij, French œ, and old-style „German“
|
|
|
|
quotation marks was considered tolerable.
|
2004-11-03 13:51:07 +00:00
|
|
|
.TP
|
|
|
|
8859-2 (Latin-2)
|
2014-06-05 09:00:14 +00:00
|
|
|
Latin-2 supports many Latin-written Central and East European
|
|
|
|
languages such as Bosnian, Croatian, Czech, German, Hungarian, Polish,
|
2004-11-03 13:51:07 +00:00
|
|
|
Slovak, and Slovene.
|
2014-06-05 09:00:14 +00:00
|
|
|
Replacing Romanian ș/ț with ş/ţ was considered tolerable.
|
2004-11-03 13:51:07 +00:00
|
|
|
.TP
|
|
|
|
8859-3 (Latin-3)
|
2014-06-05 12:25:56 +00:00
|
|
|
Latin-3 was designed to cover Esperanto, Maltese, and Turkish, but
|
2014-06-05 09:00:14 +00:00
|
|
|
8859-9 later superseded it for Turkish.
|
2004-11-03 13:51:07 +00:00
|
|
|
.TP
|
|
|
|
8859-4 (Latin-4)
|
2014-06-05 09:00:14 +00:00
|
|
|
Latin-4 introduced letters for North European languages such as
|
2014-06-05 12:25:56 +00:00
|
|
|
Estonian, Latvian, and Lithuanian, but was superseded by 8859-10 and
|
2014-06-05 09:00:14 +00:00
|
|
|
8859-13.
|
2004-11-03 13:51:07 +00:00
|
|
|
.TP
|
|
|
|
8859-5
|
|
|
|
Cyrillic letters supporting Bulgarian, Byelorussian, Macedonian,
|
2014-06-05 09:00:14 +00:00
|
|
|
Russian, Serbian, and (almost completely) Ukrainian.
|
|
|
|
It was never widely used; see the discussion of KOI8-R/KOI8-U below.
|
2004-11-03 13:51:07 +00:00
|
|
|
.TP
|
|
|
|
8859-6
|
2014-06-05 09:00:14 +00:00
|
|
|
Was created for Arabic.
|
2007-04-12 22:42:49 +00:00
|
|
|
The 8859-6 glyph table is a fixed font of separate
|
2004-11-03 13:51:07 +00:00
|
|
|
letter forms, but a proper display engine should combine these
|
|
|
|
using the proper initial, medial, and final forms.
|
|
|
|
.TP
|
|
|
|
8859-7
|
2014-06-05 12:25:56 +00:00
|
|
|
Was created for Modern Greek in 1987, updated in 2003.
|
2004-11-03 13:51:07 +00:00
|
|
|
.TP
|
|
|
|
8859-8
|
2014-06-05 12:25:56 +00:00
|
|
|
Supports Modern Hebrew without niqud (punctuation signs).
|
2014-06-05 09:00:14 +00:00
|
|
|
Niqud and full-fledged Biblical Hebrew were outside the scope of this
|
2014-06-06 04:37:46 +00:00
|
|
|
character set.
|
2004-11-03 13:51:07 +00:00
|
|
|
.TP
|
|
|
|
8859-9 (Latin-5)
|
|
|
|
This is a variant of Latin-1 that replaces Icelandic letters with
|
|
|
|
Turkish ones.
|
|
|
|
.TP
|
|
|
|
8859-10 (Latin-6)
|
2014-06-14 06:03:45 +00:00
|
|
|
Latin-6 added the Inuit (Greenlandic) and Sami (Lappish) letters that were
|
2014-06-05 09:00:14 +00:00
|
|
|
missing in Latin-4 to cover the entire Nordic area.
|
2004-11-03 13:51:07 +00:00
|
|
|
.TP
|
|
|
|
8859-11
|
2014-06-05 09:00:14 +00:00
|
|
|
Supports the Thai alphabet and is nearly identical to the TIS-620
|
|
|
|
standard.
|
2004-11-03 13:51:07 +00:00
|
|
|
.TP
|
|
|
|
8859-12
|
2007-04-12 22:42:49 +00:00
|
|
|
This set does not exist.
|
2004-11-03 13:51:07 +00:00
|
|
|
.TP
|
|
|
|
8859-13 (Latin-7)
|
|
|
|
Supports the Baltic Rim languages; in particular, it includes Latvian
|
|
|
|
characters not found in Latin-4.
|
|
|
|
.TP
|
|
|
|
8859-14 (Latin-8)
|
2014-06-05 09:00:14 +00:00
|
|
|
This is the Celtic character set, covering Old Irish, Manx, Gaelic,
|
|
|
|
Welsh, Cornish, and Breton.
|
2004-11-03 13:51:07 +00:00
|
|
|
.TP
|
|
|
|
8859-15 (Latin-9)
|
2014-06-05 12:25:56 +00:00
|
|
|
Latin-9 is similar to the widely used Latin-1 but replaces some less
|
2014-06-05 09:00:14 +00:00
|
|
|
common symbols with the Euro sign and French and Finnish letters that
|
|
|
|
were missing in Latin-1.
|
2004-11-03 13:51:07 +00:00
|
|
|
.TP
|
|
|
|
8859-16 (Latin-10)
|
2014-06-05 09:00:14 +00:00
|
|
|
This set covers many Southeast European languages, and most
|
|
|
|
importantly supports Romanian more completely than Latin-2.
|
|
|
|
.SS KOI8-R / KOI8-U
|
|
|
|
KOI8-R is a non-ISO character set popular in Russia before Unicode.
|
|
|
|
The lower half is ASCII;
|
|
|
|
the upper is a Cyrillic character set somewhat better designed than
|
|
|
|
ISO 8859-5.
|
2014-06-05 12:25:56 +00:00
|
|
|
KOI8-U, based on KOI8-R, has better support for Ukrainian.
|
2014-06-05 09:00:14 +00:00
|
|
|
Neither of these sets is ISO-2022 compatible,
|
|
|
|
unlike the ISO-8859 series.
|
2004-11-03 13:51:07 +00:00
|
|
|
.LP
|
|
|
|
Console support for KOI8-R is available under Linux through user-mode
|
|
|
|
utilities that modify keyboard bindings and the EGA graphics table,
|
|
|
|
and employ the "user mapping" font table in the console driver.
|
2014-06-09 19:22:19 +00:00
|
|
|
.SS GB 2312
|
|
|
|
GB 2312 is a mainland Chinese national standard character set used
|
|
|
|
to express simplified Chinese.
|
|
|
|
Just like JIS X 0208, characters are
|
|
|
|
mapped into a 94x94 two-byte matrix used to construct EUC-CN.
|
|
|
|
EUC-CN
|
|
|
|
is the most important encoding for Linux and includes ASCII and
|
|
|
|
GB 2312.
|
|
|
|
Note that EUC-CN is often called GB, GB 2312, or CN-GB.
|
|
|
|
.SS Big5
|
|
|
|
Big5 was a popular character set in Taiwan to express traditional
|
|
|
|
Chinese.
|
|
|
|
(Big5 is both a character set and an encoding.)
|
|
|
|
It is a superset of ASCII.
|
|
|
|
Non-ASCII characters are expressed in two bytes.
|
|
|
|
Bytes 0xa1-0xfe are used as leading bytes for two-byte characters.
|
|
|
|
Big5 and its extension were widely used in Taiwan and Hong Kong.
|
|
|
|
It is not ISO 2022 compliant.
|
2007-04-12 22:42:49 +00:00
|
|
|
.\" Thanks to Tomohiro KUBOTA for the following sections about
|
2004-11-03 13:51:07 +00:00
|
|
|
.\" national standards.
|
2007-06-15 19:55:07 +00:00
|
|
|
.SS JIS X 0208
|
2007-04-12 22:42:49 +00:00
|
|
|
JIS X 0208 is a Japanese national standard character set.
|
|
|
|
Though there are some more Japanese national standard character sets (like
|
|
|
|
JIS X 0201, JIS X 0212, and JIS X 0213), this is the most important one.
|
|
|
|
Characters are mapped into a 94x94 two-byte matrix,
|
|
|
|
each byte of which is in the range 0x21-0x7e.
|
|
|
|
Note that JIS X 0208 is a character set, not an encoding.
|
|
|
|
This means that JIS X 0208
|
|
|
|
itself is not used for expressing text data.
|
|
|
|
JIS X 0208 is used
|
2004-11-03 13:51:07 +00:00
|
|
|
as a component to construct encodings such as EUC-JP, Shift_JIS,
|
2007-04-12 22:42:49 +00:00
|
|
|
and ISO-2022-JP.
|
|
|
|
EUC-JP is the most important encoding for Linux
|
2014-06-05 09:00:14 +00:00
|
|
|
and includes ASCII and JIS X 0208.
|
2007-04-12 22:42:49 +00:00
|
|
|
In EUC-JP, JIS X 0208
|
2004-11-03 13:51:07 +00:00
|
|
|
characters are expressed in two bytes, each of which is the
|
|
|
|
JIS X 0208 code plus 0x80.
|
2007-06-15 19:55:07 +00:00
|
|
|
.SS KS X 1001
|
2007-04-12 22:42:49 +00:00
|
|
|
KS X 1001 is a Korean national standard character set.
|
|
|
|
Just as
|
2004-11-03 13:51:07 +00:00
|
|
|
JIS X 0208, characters are mapped into a 94x94 two-byte matrix.
|
|
|
|
KS X 1001 is used like JIS X 0208, as a component
|
|
|
|
to construct encodings such as EUC-KR, Johab, and ISO-2022-KR.
|
|
|
|
EUC-KR is the most important encoding for Linux and includes
|
2014-06-05 09:00:14 +00:00
|
|
|
ASCII and KS X 1001.
|
2007-04-12 22:42:49 +00:00
|
|
|
KS C 5601 is an older name for KS X 1001.
|
2014-06-09 19:22:19 +00:00
|
|
|
.SS ISO 2022 and ISO 4873
|
|
|
|
The ISO 2022 and 4873 standards describe a font-control model
|
|
|
|
based on VT100 practice.
|
|
|
|
This model is (partially) supported
|
|
|
|
by the Linux kernel and by
|
|
|
|
.BR xterm (1).
|
2014-06-11 09:27:37 +00:00
|
|
|
Several ISO 2022-based character encodings have been defined,
|
|
|
|
especially for Japanese.
|
2014-06-09 19:22:19 +00:00
|
|
|
.LP
|
|
|
|
There are 4 graphic character sets, called G0, G1, G2, and G3,
|
|
|
|
and one of them is the current character set for codes with
|
|
|
|
high bit zero (initially G0), and one of them is the current
|
|
|
|
character set for codes with high bit one (initially G1).
|
|
|
|
Each graphic character set has 94 or 96 characters, and is
|
|
|
|
essentially a 7-bit character set.
|
|
|
|
It uses codes either
|
|
|
|
040-0177 (041-0176) or 0240-0377 (0241-0376).
|
|
|
|
G0 always has size 94 and uses codes 041-0176.
|
|
|
|
.LP
|
|
|
|
Switching between character sets is done using the shift functions
|
|
|
|
\fB^N\fP (SO or LS1), \fB^O\fP (SI or LS0), ESC n (LS2), ESC o (LS3),
|
|
|
|
ESC N (SS2), ESC O (SS3), ESC ~ (LS1R), ESC } (LS2R), ESC | (LS3R).
|
|
|
|
The function LS\fIn\fP makes character set G\fIn\fP the current one
|
|
|
|
for codes with high bit zero.
|
|
|
|
The function LS\fIn\fPR makes character set G\fIn\fP the current one
|
|
|
|
for codes with high bit one.
|
|
|
|
The function SS\fIn\fP makes character set G\fIn\fP (\fIn\fP=2 or 3)
|
|
|
|
the current one for the next character only (regardless of the value
|
|
|
|
of its high order bit).
|
|
|
|
.LP
|
|
|
|
A 94-character set is designated as G\fIn\fP character set
|
|
|
|
by an escape sequence ESC ( xx (for G0), ESC ) xx (for G1),
|
|
|
|
ESC * xx (for G2), ESC + xx (for G3), where xx is a symbol
|
|
|
|
or a pair of symbols found in the ISO 2375 International
|
|
|
|
Register of Coded Character Sets.
|
|
|
|
For example, ESC ( @ selects the ISO 646 character set as G0,
|
|
|
|
ESC ( A selects the UK standard character set (with pound
|
|
|
|
instead of number sign), ESC ( B selects ASCII (with dollar
|
|
|
|
instead of currency sign), ESC ( M selects a character set
|
|
|
|
for African languages, ESC ( ! A selects the Cuban character
|
|
|
|
set, and so on.
|
|
|
|
.LP
|
|
|
|
A 96-character set is designated as G\fIn\fP character set
|
|
|
|
by an escape sequence ESC \- xx (for G1), ESC . xx (for G2)
|
|
|
|
or ESC / xx (for G3).
|
|
|
|
For example, ESC \- G selects the Hebrew alphabet as G1.
|
|
|
|
.LP
|
|
|
|
A multibyte character set is designated as G\fIn\fP character set
|
|
|
|
by an escape sequence ESC $ xx or ESC $ ( xx (for G0),
|
|
|
|
ESC $ ) xx (for G1), ESC $ * xx (for G2), ESC $ + xx (for G3).
|
|
|
|
For example, ESC $ ( C selects the Korean character set for G0.
|
|
|
|
The Japanese character set selected by ESC $ B has a more
|
|
|
|
recent version selected by ESC & @ ESC $ B.
|
|
|
|
.LP
|
|
|
|
ISO 4873 stipulates a narrower use of character sets, where G0
|
|
|
|
is fixed (always ASCII), so that G1, G2 and G3
|
|
|
|
can be invoked only for codes with the high order bit set.
|
|
|
|
In particular, \fB^N\fP and \fB^O\fP are not used anymore, ESC ( xx
|
|
|
|
can be used only with xx=B, and ESC ) xx, ESC * xx, ESC + xx
|
|
|
|
are equivalent to ESC \- xx, ESC . xx, ESC / xx, respectively.
|
2014-06-05 09:00:14 +00:00
|
|
|
.SS TIS-620
|
|
|
|
TIS-620 is a Thai national standard character set and a superset
|
|
|
|
of ASCII.
|
2014-06-05 12:25:56 +00:00
|
|
|
In the same fashion as the ISO 8859 series, Thai characters are mapped into
|
2007-04-12 22:42:49 +00:00
|
|
|
0xa1-0xfe.
|
2014-06-05 09:00:14 +00:00
|
|
|
.SS Unicode
|
|
|
|
Unicode (ISO 10646) is a standard which aims to unambiguously represent
|
|
|
|
every character in every human language.
|
2007-04-12 22:42:49 +00:00
|
|
|
Unicode's structure permits 20.1 bits to encode every character.
|
2014-06-14 06:03:45 +00:00
|
|
|
Since most computers don't include 20.1-bit integers, Unicode is
|
|
|
|
usually encoded as 32-bit integers internally and either a series of
|
|
|
|
16-bit integers (UTF-16) (needing two 16-bit integers only when
|
2014-06-05 09:00:14 +00:00
|
|
|
encoding certain rare characters) or a series of 8-bit bytes (UTF-8).
|
2004-11-03 13:51:07 +00:00
|
|
|
.LP
|
|
|
|
Linux represents Unicode using the 8-bit Unicode Transformation Format
|
2007-04-12 22:42:49 +00:00
|
|
|
(UTF-8).
|
|
|
|
UTF-8 is a variable length encoding of Unicode.
|
|
|
|
It uses 1
|
2004-11-03 13:51:07 +00:00
|
|
|
byte to code 7 bits, 2 bytes for 11 bits, 3 bytes for 16 bits, 4 bytes
|
|
|
|
for 21 bits, 5 bytes for 26 bits, 6 bytes for 31 bits.
|
|
|
|
.LP
|
2007-04-12 22:42:49 +00:00
|
|
|
Let 0,1,x stand for a zero, one, or arbitrary bit.
|
|
|
|
A byte 0xxxxxxx
|
2004-11-03 13:51:07 +00:00
|
|
|
stands for the Unicode 00000000 0xxxxxxx which codes the same symbol
|
2007-04-12 22:42:49 +00:00
|
|
|
as the ASCII 0xxxxxxx.
|
|
|
|
Thus, ASCII goes unchanged into UTF-8, and
|
2004-11-03 13:51:07 +00:00
|
|
|
people using only ASCII do not notice any change: not in code, and not
|
|
|
|
in file size.
|
|
|
|
.LP
|
|
|
|
A byte 110xxxxx is the start of a 2-byte code, and 110xxxxx 10yyyyyy
|
2007-04-12 22:42:49 +00:00
|
|
|
is assembled into 00000xxx xxyyyyyy.
|
|
|
|
A byte 1110xxxx is the start
|
2004-11-03 13:51:07 +00:00
|
|
|
of a 3-byte code, and 1110xxxx 10yyyyyy 10zzzzzz is assembled
|
|
|
|
into xxxxyyyy yyzzzzzz.
|
|
|
|
(When UTF-8 is used to code the 31-bit ISO 10646
|
|
|
|
then this progression continues up to 6-byte codes.)
|
|
|
|
.LP
|
2014-06-05 09:00:14 +00:00
|
|
|
For most texts in ISO-8859 character sets, this means that the
|
2007-04-12 22:42:49 +00:00
|
|
|
characters outside of ASCII are now coded with two bytes.
|
|
|
|
This tends
|
|
|
|
to expand ordinary text files by only one or two percent.
|
|
|
|
For Russian
|
2014-06-05 09:00:14 +00:00
|
|
|
or Greek texts, this expands ordinary text files by 100%, since text in
|
2007-04-12 22:42:49 +00:00
|
|
|
those languages is mostly outside of ASCII.
|
|
|
|
For Japanese users this means
|
|
|
|
that the 16-bit codes now in common use will take three bytes.
|
2014-06-14 06:03:45 +00:00
|
|
|
While there are algorithmic conversions from some character sets
|
|
|
|
(especially ISO 8859-1) to Unicode, general conversion requires
|
|
|
|
carrying around conversion tables, which can be quite large for 16-bit
|
2014-06-05 09:00:14 +00:00
|
|
|
codes.
|
2004-11-03 13:51:07 +00:00
|
|
|
.LP
|
|
|
|
Note that UTF-8 is self-synchronizing: 10xxxxxx is a tail, any other
|
2007-04-12 22:42:49 +00:00
|
|
|
byte is the head of a code.
|
|
|
|
Note that the only way ASCII bytes occur
|
|
|
|
in a UTF-8 stream is as themselves.
|
|
|
|
In particular, there are no
|
2008-06-09 15:49:35 +00:00
|
|
|
embedded NULs (\(aq\\0\(aq) or \(aq/\(aqs that form part of some larger code.
|
2004-11-03 13:51:07 +00:00
|
|
|
.LP
|
2008-06-09 15:49:35 +00:00
|
|
|
Since ASCII, and, in particular, NUL and \(aq/\(aq, are unchanged, the
|
2007-04-12 22:42:49 +00:00
|
|
|
kernel does not notice that UTF-8 is being used.
|
|
|
|
It does not care at
|
2004-11-03 13:51:07 +00:00
|
|
|
all what the bytes it is handling stand for.
|
|
|
|
.LP
|
|
|
|
Rendering of Unicode data streams is typically handled through
|
2008-06-05 20:14:50 +00:00
|
|
|
"subfont" tables which map a subset of Unicode to glyphs.
|
2007-04-12 22:42:49 +00:00
|
|
|
Internally
|
2004-11-03 13:51:07 +00:00
|
|
|
the kernel uses Unicode to describe the subfont loaded in video RAM.
|
2014-06-14 06:03:45 +00:00
|
|
|
This means that in the Linux console in UTF-8 mode, one can use a character
|
2014-06-05 09:00:14 +00:00
|
|
|
set with 512 different symbols.
|
2014-06-05 12:25:56 +00:00
|
|
|
This is not enough for Japanese, Chinese, and
|
2004-11-03 13:51:07 +00:00
|
|
|
Korean, but it is enough for most other purposes.
|
getent.1, intro.1, time.1, _exit.2, _syscall.2, accept.2, access.2, acct.2, adjtimex.2, alarm.2, alloc_hugepages.2, arch_prctl.2, bdflush.2, bind.2, brk.2, cacheflush.2, capget.2, chdir.2, chmod.2, chown.2, chroot.2, clock_getres.2, clock_nanosleep.2, clone.2, close.2, connect.2, create_module.2, delete_module.2, dup.2, epoll_create.2, epoll_ctl.2, epoll_wait.2, eventfd.2, execve.2, exit_group.2, faccessat.2, fchmodat.2, fchownat.2, fcntl.2, flock.2, fork.2, fstatat.2, fsync.2, futex.2, futimesat.2, get_kernel_syms.2, get_robust_list.2, get_thread_area.2, getcpu.2, getdents.2, getdomainname.2, getgid.2, getgroups.2, gethostname.2, getitimer.2, getpagesize.2, getpeername.2, getpid.2, getpriority.2, getresuid.2, getrlimit.2, getrusage.2, getsid.2, getsockname.2, getsockopt.2, gettid.2, gettimeofday.2, getuid.2, getunwind.2, getxattr.2, idle.2, init_module.2, inotify_add_watch.2, inotify_init.2, inotify_rm_watch.2, intro.2, io_cancel.2, io_destroy.2, io_getevents.2, io_setup.2, io_submit.2, ioctl.2, ioctl_list.2, ioperm.2, iopl.2, ioprio_set.2, ipc.2, kcmp.2, kill.2, killpg.2, link.2, linkat.2, listen.2, listxattr.2, llseek.2, lookup_dcookie.2, lseek.2, madvise.2, migrate_pages.2, mincore.2, mkdir.2, mkdirat.2, mknod.2, mknodat.2, mlock.2, mmap.2, mmap2.2, modify_ldt.2, mount.2, move_pages.2, mprotect.2, mq_getsetattr.2, mremap.2, msgctl.2, msgget.2, msgop.2, msync.2, nanosleep.2, nfsservctl.2, nice.2, open.2, openat.2, outb.2, pause.2, pciconfig_read.2, perf_event_open.2, perfmonctl.2, personality.2, pipe.2, pivot_root.2, poll.2, posix_fadvise.2, prctl.2, pread.2, process_vm_readv.2, ptrace.2, query_module.2, quotactl.2, read.2, readahead.2, readdir.2, readlink.2, readlinkat.2, readv.2, reboot.2, recv.2, remap_file_pages.2, removexattr.2, rename.2, renameat.2, rmdir.2, rt_sigqueueinfo.2, sched_get_priority_max.2, sched_rr_get_interval.2, sched_setaffinity.2, sched_setparam.2, sched_setscheduler.2, sched_yield.2, select.2, semctl.2, semget.2, semop.2, send.2, 
sendfile.2, set_thread_area.2, set_tid_address.2, seteuid.2, setfsgid.2, setfsuid.2, setgid.2, setpgid.2, setresuid.2, setreuid.2, setsid.2, setuid.2, setup.2, setxattr.2, shmctl.2, shmget.2, shmop.2, shutdown.2, sigaction.2, sigaltstack.2, signal.2, signalfd.2, sigpending.2, sigprocmask.2, sigreturn.2, sigsuspend.2, sigwaitinfo.2, socket.2, socketcall.2, socketpair.2, splice.2, stat.2, statfs.2, stime.2, swapon.2, symlink.2, symlinkat.2, sync.2, sync_file_range.2, sysctl.2, sysfs.2, sysinfo.2, syslog.2, tee.2, time.2, timerfd_create.2, times.2, tkill.2, truncate.2, umask.2, umount.2, uname.2, unimplemented.2, unlink.2, unlinkat.2, uselib.2, ustat.2, utime.2, utimensat.2, vfork.2, vhangup.2, vm86.2, vmsplice.2, wait.2, wait4.2, write.2, CPU_SET.3, INFINITY.3, MB_CUR_MAX.3, MB_LEN_MAX.3, __setfpucw.3, a64l.3, abort.3, abs.3, acos.3, acosh.3, addseverity.3, adjtime.3, aio_cancel.3, aio_error.3, aio_fsync.3, aio_read.3, aio_return.3, aio_suspend.3, aio_write.3, alloca.3, argz_add.3, asin.3, asinh.3, asprintf.3, assert.3, assert_perror.3, atan.3, atan2.3, atanh.3, atexit.3, atof.3, atoi.3, backtrace.3, basename.3, bcmp.3, bcopy.3, bindresvport.3, bsd_signal.3, bsearch.3, bstring.3, btowc.3, btree.3, byteorder.3, bzero.3, cabs.3, cacos.3, cacosh.3, canonicalize_file_name.3, carg.3, casin.3, casinh.3, catan.3, catanh.3, catgets.3, catopen.3, cbrt.3, ccos.3, ccosh.3, ceil.3, cerf.3, cexp.3, cexp2.3, cfree.3, cimag.3, clearenv.3, clock.3, clock_getcpuclockid.3, clog.3, clog10.3, clog2.3, closedir.3, cmsg.3, confstr.3, conj.3, copysign.3, cos.3, cosh.3, cpow.3, cproj.3, creal.3, crypt.3, csin.3, csinh.3, csqrt.3, ctan.3, ctanh.3, ctermid.3, ctime.3, daemon.3, dbopen.3, des_crypt.3, difftime.3, dirfd.3, div.3, dl_iterate_phdr.3, dlopen.3, dprintf.3, drand48.3, drand48_r.3, dysize.3, ecvt.3, ecvt_r.3, encrypt.3, end.3, endian.3, envz_add.3, erf.3, erfc.3, err.3, errno.3, error.3, ether_aton.3, euidaccess.3, exec.3, exit.3, exp.3, exp10.3, exp2.3, expm1.3, fabs.3, fclose.3, 
fcloseall.3, fdim.3, fenv.3, ferror.3, fexecve.3, fflush.3, ffs.3, fgetgrent.3, fgetpwent.3, fgetwc.3, fgetws.3, finite.3, flockfile.3, floor.3, fma.3, fmax.3, fmemopen.3, fmin.3, fmod.3, fmtmsg.3, fnmatch.3, fopen.3, fpathconf.3, fpclassify.3, fpurge.3, fputwc.3, fputws.3, fread.3, frexp.3, fseek.3, fseeko.3, ftime.3, ftok.3, fts.3, ftw.3, futimes.3, fwide.3, gamma.3, gcvt.3, getaddrinfo.3, getaddrinfo_a.3, getauxval.3, getcontext.3, getcwd.3, getdate.3, getdirentries.3, getdtablesize.3, getenv.3, getfsent.3, getgrent.3, getgrent_r.3, getgrnam.3, getgrouplist.3, gethostbyname.3, gethostid.3, getipnodebyname.3, getline.3, getloadavg.3, getlogin.3, getmntent.3, getnameinfo.3, getnetent.3, getnetent_r.3, getopt.3, getpass.3, getprotoent.3, getprotoent_r.3, getpt.3, getpw.3, getpwent.3, getpwent_r.3, getpwnam.3, getrpcent.3, getrpcent_r.3, getrpcport.3, gets.3, getservent.3, getservent_r.3, getspnam.3, getttyent.3, getumask.3, getusershell.3, getutent.3, getw.3, getwchar.3, glob.3, grantpt.3, gsignal.3, hash.3, hsearch.3, hypot.3, iconv.3, iconv_close.3, iconv_open.3, ilogb.3, index.3, inet.3, inet_ntop.3, inet_pton.3, infnan.3, initgroups.3, insque.3, intro.3, isalpha.3, isatty.3, isgreater.3, iswalnum.3, iswalpha.3, iswblank.3, iswcntrl.3, iswctype.3, iswdigit.3, iswgraph.3, iswlower.3, iswprint.3, iswpunct.3, iswspace.3, iswupper.3, iswxdigit.3, j0.3, key_setsecret.3, ldexp.3, lgamma.3, lio_listio.3, localeconv.3, lockf.3, log.3, log10.3, log1p.3, log2.3, logb.3, login.3, longjmp.3, lrint.3, lround.3, lsearch.3, lseek64.3, makecontext.3, makedev.3, malloc.3, malloc_hook.3, mblen.3, mbrlen.3, mbrtowc.3, mbsinit.3, mbsnrtowcs.3, mbsrtowcs.3, mbstowcs.3, mbtowc.3, memccpy.3, memchr.3, memcmp.3, memcpy.3, memfrob.3, memmem.3, memmove.3, mempcpy.3, memset.3, mkdtemp.3, mkfifo.3, mkfifoat.3, mkstemp.3, mktemp.3, modf.3, mpool.3, mq_close.3, mq_getattr.3, mq_notify.3, mq_open.3, mq_receive.3, mq_send.3, mq_unlink.3, mtrace.3, nan.3, netlink.3, nextafter.3, nl_langinfo.3, 
offsetof.3, on_exit.3, opendir.3, openpty.3, perror.3, popen.3, posix_fallocate.3, posix_memalign.3, posix_openpt.3, pow.3, pow10.3, printf.3, profil.3, program_invocation_name.3, psignal.3, pthread_kill_other_threads_np.3, ptsname.3, putenv.3, putgrent.3, putpwent.3, puts.3, putwchar.3, qecvt.3, qsort.3, queue.3, raise.3, rand.3, random.3, random_r.3, rcmd.3, re_comp.3, readdir.3, realpath.3, recno.3, regex.3, remainder.3, remove.3, remquo.3, resolver.3, rewinddir.3, rexec.3, rint.3, round.3, rpc.3, rpmatch.3, rtime.3, rtnetlink.3, scalb.3, scalbln.3, scandir.3, scandirat.3, scanf.3, seekdir.3, sem_close.3, sem_destroy.3, sem_getvalue.3, sem_init.3, sem_open.3, sem_post.3, sem_unlink.3, sem_wait.3, setaliasent.3, setbuf.3, setenv.3, setjmp.3, setlocale.3, setlogmask.3, setnetgrent.3, shm_open.3, siginterrupt.3, signbit.3, significand.3, sigpause.3, sigqueue.3, sigset.3, sigsetops.3, sigvec.3, sin.3, sincos.3, sinh.3, sleep.3, sockatmark.3, sqrt.3, statvfs.3, stdarg.3, stdin.3, stdio.3, stdio_ext.3, stpcpy.3, stpncpy.3, strcasecmp.3, strcat.3, strchr.3, strcmp.3, strcoll.3, strcpy.3, strdup.3, strerror.3, strfmon.3, strfry.3, strftime.3, string.3, strlen.3, strnlen.3, strpbrk.3, strptime.3, strsep.3, strsignal.3, strspn.3, strstr.3, strtod.3, strtoimax.3, strtok.3, strtol.3, strtoul.3, strverscmp.3, strxfrm.3, swab.3, sysconf.3, syslog.3, system.3, sysv_signal.3, tan.3, tanh.3, tcgetpgrp.3, tcgetsid.3, telldir.3, tempnam.3, termios.3, tgamma.3, timegm.3, timeradd.3, tmpfile.3, tmpnam.3, toascii.3, toupper.3, towctrans.3, towlower.3, towupper.3, trunc.3, tsearch.3, ttyname.3, ttyslot.3, tzset.3, ualarm.3, ulimit.3, ungetwc.3, unlocked_stdio.3, unlockpt.3, updwtmp.3, usleep.3, wcpcpy.3, wcpncpy.3, wcrtomb.3, wcscasecmp.3, wcscat.3, wcschr.3, wcscmp.3, wcscpy.3, wcscspn.3, wcsdup.3, wcslen.3, wcsncasecmp.3, wcsncat.3, wcsncmp.3, wcsncpy.3, wcsnlen.3, wcsnrtombs.3, wcspbrk.3, wcsrchr.3, wcsrtombs.3, wcsspn.3, wcsstr.3, wcstoimax.3, wcstok.3, wcstombs.3, wcswidth.3, 
wctob.3, wctomb.3, wctrans.3, wctype.3, wcwidth.3, wmemchr.3, wmemcmp.3, wmemcpy.3, wmemmove.3, wmemset.3, wordexp.3, wprintf.3, xcrypt.3, xdr.3, y0.3, cciss.4, console.4, console_codes.4, console_ioctl.4, dsp56k.4, fd.4, full.4, hd.4, hpsa.4, initrd.4, intro.4, lp.4, mem.4, mouse.4, null.4, pts.4, ram.4, random.4, rtc.4, sk98lin.4, st.4, tty.4, ttyS.4, tty_ioctl.4, vcs.4, wavelan.4, acct.5, charmap.5, dir_colors.5, filesystems.5, ftpusers.5, group.5, host.conf.5, hosts.5, hosts.equiv.5, intro.5, issue.5, locale.5, motd.5, networks.5, nologin.5, nscd.conf.5, passwd.5, proc.5, protocols.5, resolv.conf.5, rpc.5, securetty.5, services.5, shells.5, termcap.5, ttytype.5, utmp.5, armscii-8.7, arp.7, ascii.7, bootparam.7, capabilities.7, charsets.7, complex.7, cp1251.7, credentials.7, ddp.7, environ.7, epoll.7, fifo.7, futex.7, glob.7, hier.7, icmp.7, inotify.7, intro.7, ip.7, ipv6.7, iso_8859-1.7, iso_8859-10.7, iso_8859-11.7, iso_8859-13.7, iso_8859-14.7, iso_8859-15.7, iso_8859-16.7, iso_8859-2.7, iso_8859-3.7, iso_8859-4.7, iso_8859-5.7, iso_8859-6.7, iso_8859-7.7, iso_8859-8.7, iso_8859-9.7, koi8-r.7, koi8-u.7, locale.7, mailaddr.7, man.7, mq_overview.7, netdevice.7, netlink.7, numa.7, packet.7, path_resolution.7, pipe.7, posixoptions.7, pthreads.7, pty.7, raw.7, regex.7, rtld-audit.7, rtnetlink.7, sem_overview.7, shm_overview.7, sigevent.7, signal.7, socket.7, standards.7, suffixes.7, svipc.7, tcp.7, termio.7, time.7, udp.7, udplite.7, unicode.7, unix.7, uri.7, utf-8.7, x25.7, nscd.8, sync.8, tzselect.8, zdump.8, zic.8: Global fix: remove unneeded double quotes in .SH headings
Signed-off-by: Michael Kerrisk <mtk.manpages@gmail.com>
2013-02-24 18:01:36 +00:00
|
|
|
.SH SEE ALSO
|
2014-06-05 09:00:14 +00:00
|
|
|
.BR iconv (1),
|
2004-11-03 13:51:07 +00:00
|
|
|
.BR console (4),
|
|
|
|
.BR ascii (7),
|
|
|
|
.BR iso_8859-1 (7),
|
|
|
|
.BR unicode (7),
|
|
|
|
.BR utf-8 (7)
|