200 lines
7.6 KiB
C
200 lines
7.6 KiB
C
|
/* --- Internal Unicode Format -------------------------------------------- */
|
||
|
|
||
|
|
||
|
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t. */

#define PY_UNICODE_TYPE wchar_t
typedef wchar_t Py_UNICODE;
|
||
|
|
||
|
/* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective
   unicode representations (all of them unsigned integer types). */
typedef unsigned int Py_UCS4;
typedef unsigned short Py_UCS2;
typedef unsigned char Py_UCS1;
|
||
|
|
||
|
/* --- Unicode Type ------------------------------------------------------- */
|
||
|
|
||
|
typedef struct {
    /*
       SSTATE_NOT_INTERNED (0)
       SSTATE_INTERNED_MORTAL (1)
       SSTATE_INTERNED_IMMORTAL (2)

       If interned != SSTATE_NOT_INTERNED, the two references from the
       dictionary to this object are *not* counted in ob_refcnt.
     */
    unsigned char interned;
    /* Character size:

       - PyUnicode_WCHAR_KIND (0):

         * character type = wchar_t (16 or 32 bits, depending on the
           platform)

       - PyUnicode_1BYTE_KIND (1):

         * character type = Py_UCS1 (8 bits, unsigned)
         * all characters are in the range U+0000-U+00FF (latin1)
         * if ascii is set, all characters are in the range U+0000-U+007F
           (ASCII), otherwise at least one character is in the range
           U+0080-U+00FF

       - PyUnicode_2BYTE_KIND (2):

         * character type = Py_UCS2 (16 bits, unsigned)
         * all characters are in the range U+0000-U+FFFF (BMP)
         * at least one character is in the range U+0100-U+FFFF

       - PyUnicode_4BYTE_KIND (4):

         * character type = Py_UCS4 (32 bits, unsigned)
         * all characters are in the range U+0000-U+10FFFF
         * at least one character is in the range U+10000-U+10FFFF
     */
    unsigned char kind;
    /* Compact is with respect to the allocation scheme. Compact unicode
       objects only require one memory block while non-compact objects use
       one block for the PyUnicodeObject struct and another for its data
       buffer. */
    unsigned char compact;
    /* The string only contains characters in the range U+0000-U+007F (ASCII)
       and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
       set, use the PyASCIIObject structure. */
    unsigned char ascii;
    /* The ready flag indicates whether the object layout is initialized
       completely. This means that this is either a compact object, or
       the data pointer is filled out. The bit is redundant, and helps
       to minimize the test in PyUnicode_IS_READY(). */
    unsigned char ready;
    /* Padding to ensure that PyUnicode_DATA() is always aligned to
       4 bytes (see issue #19537 on m68k): not on PyPy, so no padding
       member follows. */
} _PyASCIIObject_state_t;
|
||
|
|
||
|
/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
|
||
|
structure. state.ascii and state.compact are set, and the data
|
||
|
immediately follow the structure. utf8_length and wstr_length can be found
|
||
|
in the length field; the utf8 pointer is equal to the data pointer. */
|
||
|
typedef struct {
|
||
|
/* There are 4 forms of Unicode strings:
|
||
|
|
||
|
- compact ascii:
|
||
|
|
||
|
* structure = PyASCIIObject
|
||
|
* test: PyUnicode_IS_COMPACT_ASCII(op)
|
||
|
* kind = PyUnicode_1BYTE_KIND
|
||
|
* compact = 1
|
||
|
* ascii = 1
|
||
|
* ready = 1
|
||
|
* (length is the length of the utf8 and wstr strings)
|
||
|
* (data starts just after the structure)
|
||
|
* (since ASCII is decoded from UTF-8, the utf8 string are the data)
|
||
|
|
||
|
- compact:
|
||
|
|
||
|
* structure = PyCompactUnicodeObject
|
||
|
* test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
|
||
|
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
|
||
|
PyUnicode_4BYTE_KIND
|
||
|
* compact = 1
|
||
|
* ready = 1
|
||
|
* ascii = 0
|
||
|
* utf8 is not shared with data
|
||
|
* utf8_length = 0 if utf8 is NULL
|
||
|
* wstr is shared with data and wstr_length=length
|
||
|
if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
|
||
|
or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
|
||
|
* wstr_length = 0 if wstr is NULL
|
||
|
* (data starts just after the structure)
|
||
|
|
||
|
- legacy string, not ready:
|
||
|
|
||
|
* structure = PyUnicodeObject
|
||
|
* test: kind == PyUnicode_WCHAR_KIND
|
||
|
* length = 0 (use wstr_length)
|
||
|
* hash = -1
|
||
|
* kind = PyUnicode_WCHAR_KIND
|
||
|
* compact = 0
|
||
|
* ascii = 0
|
||
|
* ready = 0
|
||
|
* interned = SSTATE_NOT_INTERNED
|
||
|
* wstr is not NULL
|
||
|
* data.any is NULL
|
||
|
* utf8 is NULL
|
||
|
* utf8_length = 0
|
||
|
|
||
|
- legacy string, ready:
|
||
|
|
||
|
* structure = PyUnicodeObject structure
|
||
|
* test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
|
||
|
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
|
||
|
PyUnicode_4BYTE_KIND
|
||
|
* compact = 0
|
||
|
* ready = 1
|
||
|
* data.any is not NULL
|
||
|
* utf8 is shared and utf8_length = length with data.any if ascii = 1
|
||
|
* utf8_length = 0 if utf8 is NULL
|
||
|
* wstr is shared with data.any and wstr_length = length
|
||
|
if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
|
||
|
or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
|
||
|
* wstr_length = 0 if wstr is NULL
|
||
|
|
||
|
Compact strings use only one memory block (structure + characters),
|
||
|
whereas legacy strings use one block for the structure and one block
|
||
|
for characters.
|
||
|
|
||
|
Legacy strings are created by PyUnicode_FromUnicode() and
|
||
|
PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
|
||
|
when PyUnicode_READY() is called.
|
||
|
|
||
|
See also _PyUnicode_CheckConsistency().
|
||
|
*/
|
||
|
PyObject_HEAD
|
||
|
Py_ssize_t length; /* Number of code points in the string */
|
||
|
//Py_hash_t hash; /* Hash value; -1 if not set */
|
||
|
_PyASCIIObject_state_t state;
|
||
|
wchar_t *wstr; /* wchar_t representation (null-terminated) */
|
||
|
} PyASCIIObject;
|
||
|
|
||
|
/* Non-ASCII strings allocated through PyUnicode_New use the
|
||
|
PyCompactUnicodeObject structure. state.compact is set, and the data
|
||
|
immediately follow the structure. */
|
||
|
typedef struct {
|
||
|
PyASCIIObject _base;
|
||
|
Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
|
||
|
* terminating \0. */
|
||
|
char *utf8; /* UTF-8 representation (null-terminated) */
|
||
|
Py_ssize_t wstr_length; /* Number of code points in wstr, possible
|
||
|
* surrogates count as two code points. */
|
||
|
} PyCompactUnicodeObject;
|
||
|
|
||
|
/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
|
||
|
PyUnicodeObject structure. The actual string data is initially in the wstr
|
||
|
block, and copied into the data block using _PyUnicode_Ready. */
|
||
|
typedef struct {
|
||
|
PyCompactUnicodeObject _base;
|
||
|
void* data; /* Canonical, smallest-form Unicode buffer */
|
||
|
} PyUnicodeObject;
|
||
|
|
||
|
|
||
|
/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
|
||
|
|
||
|
/* Values for PyASCIIObject.state: */

/* Interning state (stored in the `interned` member of the state struct). */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
|
||
|
|
||
|
/* --- Constants ---------------------------------------------------------- */
|
||
|
|
||
|
/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
|