1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165 |
- The .xz File Format
- ===================
- Version 1.1.0 (2022-12-11)
- 0. Preface
- 0.1. Notices and Acknowledgements
- 0.2. Getting the Latest Version
- 0.3. Version History
- 1. Conventions
- 1.1. Byte and Its Representation
- 1.2. Multibyte Integers
- 2. Overall Structure of .xz File
- 2.1. Stream
- 2.1.1. Stream Header
- 2.1.1.1. Header Magic Bytes
- 2.1.1.2. Stream Flags
- 2.1.1.3. CRC32
- 2.1.2. Stream Footer
- 2.1.2.1. CRC32
- 2.1.2.2. Backward Size
- 2.1.2.3. Stream Flags
- 2.1.2.4. Footer Magic Bytes
- 2.2. Stream Padding
- 3. Block
- 3.1. Block Header
- 3.1.1. Block Header Size
- 3.1.2. Block Flags
- 3.1.3. Compressed Size
- 3.1.4. Uncompressed Size
- 3.1.5. List of Filter Flags
- 3.1.6. Header Padding
- 3.1.7. CRC32
- 3.2. Compressed Data
- 3.3. Block Padding
- 3.4. Check
- 4. Index
- 4.1. Index Indicator
- 4.2. Number of Records
- 4.3. List of Records
- 4.3.1. Unpadded Size
- 4.3.2. Uncompressed Size
- 4.4. Index Padding
- 4.5. CRC32
- 5. Filter Chains
- 5.1. Alignment
- 5.2. Security
- 5.3. Filters
- 5.3.1. LZMA2
- 5.3.2. Branch/Call/Jump Filters for Executables
- 5.3.3. Delta
- 5.3.3.1. Format of the Encoded Output
- 5.4. Custom Filter IDs
- 5.4.1. Reserved Custom Filter ID Ranges
- 6. Cyclic Redundancy Checks
- 7. References
- 0. Preface
- This document describes the .xz file format (filename suffix
- ".xz", MIME type "application/x-xz"). It is intended that this
- this format replace the old .lzma format used by LZMA SDK and
- LZMA Utils.
- 0.1. Notices and Acknowledgements
- This file format was designed by Lasse Collin
- <lasse.collin@tukaani.org> and Igor Pavlov.
- Special thanks for helping with this document goes to
- Ville Koskinen. Thanks for helping with this document goes to
- Mark Adler, H. Peter Anvin, Mikko Pouru, and Lars Wirzenius.
- This document has been put into the public domain.
- 0.2. Getting the Latest Version
- The latest official version of this document can be downloaded
- from <http://tukaani.org/xz/xz-file-format.txt>.
- Specific versions of this document have a filename
- xz-file-format-X.Y.Z.txt where X.Y.Z is the version number.
- For example, the version 1.0.0 of this document is available
- at <http://tukaani.org/xz/xz-file-format-1.0.0.txt>.
- 0.3. Version History
- Version Date Description
- 1.1.0 2022-12-11 Added ARM64 filter and clarified 32-bit
- ARM endianness in Section 5.3.2,
- language improvements in Section 5.4
- 1.0.4 2009-08-27 Language improvements in Sections 1.2,
- 2.1.1.2, 3.1.1, 3.1.2, and 5.3.1
- 1.0.3 2009-06-05 Spelling fixes in Sections 5.1 and 5.4
- 1.0.2 2009-06-04 Typo fixes in Sections 4 and 5.3.1
- 1.0.1 2009-06-01 Typo fix in Section 0.3 and minor
- clarifications to Sections 2, 2.2,
- 3.3, 4.4, and 5.3.2
- 1.0.0 2009-01-14 The first official version
- 1. Conventions
- The key words "MUST", "MUST NOT", "REQUIRED", "SHOULD",
- "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
- document are to be interpreted as described in [RFC-2119].
- Indicating a warning means displaying a message, returning
- appropriate exit status, or doing something else to let the
- user know that something worth warning occurred. The operation
- SHOULD still finish if a warning is indicated.
- Indicating an error means displaying a message, returning
- appropriate exit status, or doing something else to let the
- user know that something prevented successfully finishing the
- operation. The operation MUST be aborted once an error has
- been indicated.
- 1.1. Byte and Its Representation
- In this document, byte is always 8 bits.
- A "null byte" has all bits unset. That is, the value of a null
- byte is 0x00.
- To represent byte blocks, this document uses notation that
- is similar to the notation used in [RFC-1952]:
- +-------+
- | Foo | One byte.
- +-------+
- +---+---+
- | Foo | Two bytes; that is, some of the vertical bars
- +---+---+ can be missing.
- +=======+
- | Foo | Zero or more bytes.
- +=======+
- In this document, a boxed byte or a byte sequence declared
- using this notation is called "a field". The example field
- above would be called "the Foo field" or plain "Foo".
- If there are many fields, they may be split to multiple lines.
- This is indicated with an arrow ("--->"):
- +=====+
- | Foo |
- +=====+
- +=====+
- ---> | Bar |
- +=====+
- The above is equivalent to this:
- +=====+=====+
- | Foo | Bar |
- +=====+=====+
- 1.2. Multibyte Integers
- Multibyte integers of static length, such as CRC values,
- are stored in little endian byte order (least significant
- byte first).
- When smaller values are more likely than bigger values (for
- example file sizes), multibyte integers are encoded in a
- variable-length representation:
- - Numbers in the range [0, 127] are copied as is, and take
- one byte of space.
- - Bigger numbers will occupy two or more bytes. All but the
- last byte of the multibyte representation have the highest
- (eighth) bit set.
- For now, the value of the variable-length integers is limited
- to 63 bits, which limits the encoded size of the integer to
- nine bytes. These limits may be increased in the future if
- needed.
- The following C code illustrates encoding and decoding of
- variable-length integers. The functions return the number of
- bytes occupied by the integer (1-9), or zero on error.
- #include <stddef.h>
- #include <inttypes.h>
- size_t
- encode(uint8_t buf[static 9], uint64_t num)
- {
- if (num > UINT64_MAX / 2)
- return 0;
- size_t i = 0;
- while (num >= 0x80) {
- buf[i++] = (uint8_t)(num) | 0x80;
- num >>= 7;
- }
- buf[i++] = (uint8_t)(num);
- return i;
- }
- size_t
- decode(const uint8_t buf[], size_t size_max, uint64_t *num)
- {
- if (size_max == 0)
- return 0;
- if (size_max > 9)
- size_max = 9;
- *num = buf[0] & 0x7F;
- size_t i = 0;
- while (buf[i++] & 0x80) {
- if (i >= size_max || buf[i] == 0x00)
- return 0;
- *num |= (uint64_t)(buf[i] & 0x7F) << (i * 7);
- }
- return i;
- }
- 2. Overall Structure of .xz File
- A standalone .xz files consist of one or more Streams which may
- have Stream Padding between or after them:
- +========+================+========+================+
- | Stream | Stream Padding | Stream | Stream Padding | ...
- +========+================+========+================+
- The sizes of Stream and Stream Padding are always multiples
- of four bytes, thus the size of every valid .xz file MUST be
- a multiple of four bytes.
- While a typical file contains only one Stream and no Stream
- Padding, a decoder handling standalone .xz files SHOULD support
- files that have more than one Stream or Stream Padding.
- In contrast to standalone .xz files, when the .xz file format
- is used as an internal part of some other file format or
- communication protocol, it usually is expected that the decoder
- stops after the first Stream, and doesn't look for Stream
- Padding or possibly other Streams.
- 2.1. Stream
- +-+-+-+-+-+-+-+-+-+-+-+-+=======+=======+ +=======+
- | Stream Header | Block | Block | ... | Block |
- +-+-+-+-+-+-+-+-+-+-+-+-+=======+=======+ +=======+
- +=======+-+-+-+-+-+-+-+-+-+-+-+-+
- ---> | Index | Stream Footer |
- +=======+-+-+-+-+-+-+-+-+-+-+-+-+
- All the above fields have a size that is a multiple of four. If
- Stream is used as an internal part of another file format, it
- is RECOMMENDED to make the Stream start at an offset that is
- a multiple of four bytes.
- Stream Header, Index, and Stream Footer are always present in
- a Stream. The maximum size of the Index field is 16 GiB (2^34).
- There are zero or more Blocks. The maximum number of Blocks is
- limited only by the maximum size of the Index field.
- Total size of a Stream MUST be less than 8 EiB (2^63 bytes).
- The same limit applies to the total amount of uncompressed
- data stored in a Stream.
- If an implementation supports handling .xz files with multiple
- concatenated Streams, it MAY apply the above limits to the file
- as a whole instead of limiting per Stream basis.
- 2.1.1. Stream Header
- +---+---+---+---+---+---+-------+------+--+--+--+--+
- | Header Magic Bytes | Stream Flags | CRC32 |
- +---+---+---+---+---+---+-------+------+--+--+--+--+
- 2.1.1.1. Header Magic Bytes
- The first six (6) bytes of the Stream are so called Header
- Magic Bytes. They can be used to identify the file type.
- Using a C array and ASCII:
- const uint8_t HEADER_MAGIC[6]
- = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };
- In plain hexadecimal:
- FD 37 7A 58 5A 00
- Notes:
- - The first byte (0xFD) was chosen so that the files cannot
- be erroneously detected as being in .lzma format, in which
- the first byte is in the range [0x00, 0xE0].
- - The sixth byte (0x00) was chosen to prevent applications
- from misdetecting the file as a text file.
- If the Header Magic Bytes don't match, the decoder MUST
- indicate an error.
- 2.1.1.2. Stream Flags
- The first byte of Stream Flags is always a null byte. In the
- future, this byte may be used to indicate a new Stream version
- or other Stream properties.
- The second byte of Stream Flags is a bit field:
- Bit(s) Mask Description
- 0-3 0x0F Type of Check (see Section 3.4):
- ID Size Check name
- 0x00 0 bytes None
- 0x01 4 bytes CRC32
- 0x02 4 bytes (Reserved)
- 0x03 4 bytes (Reserved)
- 0x04 8 bytes CRC64
- 0x05 8 bytes (Reserved)
- 0x06 8 bytes (Reserved)
- 0x07 16 bytes (Reserved)
- 0x08 16 bytes (Reserved)
- 0x09 16 bytes (Reserved)
- 0x0A 32 bytes SHA-256
- 0x0B 32 bytes (Reserved)
- 0x0C 32 bytes (Reserved)
- 0x0D 64 bytes (Reserved)
- 0x0E 64 bytes (Reserved)
- 0x0F 64 bytes (Reserved)
- 4-7 0xF0 Reserved for future use; MUST be zero for now.
- Implementations SHOULD support at least the Check IDs 0x00
- (None) and 0x01 (CRC32). Supporting other Check IDs is
- OPTIONAL. If an unsupported Check is used, the decoder SHOULD
- indicate a warning or error.
- If any reserved bit is set, the decoder MUST indicate an error.
- It is possible that there is a new field present which the
- decoder is not aware of, and can thus parse the Stream Header
- incorrectly.
- 2.1.1.3. CRC32
- The CRC32 is calculated from the Stream Flags field. It is
- stored as an unsigned 32-bit little endian integer. If the
- calculated value does not match the stored one, the decoder
- MUST indicate an error.
- The idea is that Stream Flags would always be two bytes, even
- if new features are needed. This way old decoders will be able
- to verify the CRC32 calculated from Stream Flags, and thus
- distinguish between corrupt files (CRC32 doesn't match) and
- files that the decoder doesn't support (CRC32 matches but
- Stream Flags has reserved bits set).
- 2.1.2. Stream Footer
- +-+-+-+-+---+---+---+---+-------+------+----------+---------+
- | CRC32 | Backward Size | Stream Flags | Footer Magic Bytes |
- +-+-+-+-+---+---+---+---+-------+------+----------+---------+
- 2.1.2.1. CRC32
- The CRC32 is calculated from the Backward Size and Stream Flags
- fields. It is stored as an unsigned 32-bit little endian
- integer. If the calculated value does not match the stored one,
- the decoder MUST indicate an error.
- The reason to have the CRC32 field before the Backward Size and
- Stream Flags fields is to keep the four-byte fields aligned to
- a multiple of four bytes.
- 2.1.2.2. Backward Size
- Backward Size is stored as a 32-bit little endian integer,
- which indicates the size of the Index field as multiple of
- four bytes, minimum value being four bytes:
- real_backward_size = (stored_backward_size + 1) * 4;
- If the stored value does not match the real size of the Index
- field, the decoder MUST indicate an error.
- Using a fixed-size integer to store Backward Size makes
- it slightly simpler to parse the Stream Footer when the
- application needs to parse the Stream backwards.
- 2.1.2.3. Stream Flags
- This is a copy of the Stream Flags field from the Stream
- Header. The information stored to Stream Flags is needed
- when parsing the Stream backwards. The decoder MUST compare
- the Stream Flags fields in both Stream Header and Stream
- Footer, and indicate an error if they are not identical.
- 2.1.2.4. Footer Magic Bytes
- As the last step of the decoding process, the decoder MUST
- verify the existence of Footer Magic Bytes. If they don't
- match, an error MUST be indicated.
- Using a C array and ASCII:
- const uint8_t FOOTER_MAGIC[2] = { 'Y', 'Z' };
- In hexadecimal:
- 59 5A
- The primary reason to have Footer Magic Bytes is to make
- it easier to detect incomplete files quickly, without
- uncompressing. If the file does not end with Footer Magic Bytes
- (excluding Stream Padding described in Section 2.2), it cannot
- be undamaged, unless someone has intentionally appended garbage
- after the end of the Stream.
- 2.2. Stream Padding
- Only the decoders that support decoding of concatenated Streams
- MUST support Stream Padding.
- Stream Padding MUST contain only null bytes. To preserve the
- four-byte alignment of consecutive Streams, the size of Stream
- Padding MUST be a multiple of four bytes. Empty Stream Padding
- is allowed. If these requirements are not met, the decoder MUST
- indicate an error.
- Note that non-empty Stream Padding is allowed at the end of the
- file; there doesn't need to be a new Stream after non-empty
- Stream Padding. This can be convenient in certain situations
- [GNU-tar].
- The possibility of Stream Padding MUST be taken into account
- when designing an application that parses Streams backwards,
- and the application supports concatenated Streams.
- 3. Block
- +==============+=================+===============+=======+
- | Block Header | Compressed Data | Block Padding | Check |
- +==============+=================+===============+=======+
- 3.1. Block Header
- +-------------------+-------------+=================+
- | Block Header Size | Block Flags | Compressed Size |
- +-------------------+-------------+=================+
- +===================+======================+
- ---> | Uncompressed Size | List of Filter Flags |
- +===================+======================+
- +================+--+--+--+--+
- ---> | Header Padding | CRC32 |
- +================+--+--+--+--+
- 3.1.1. Block Header Size
- This field overlaps with the Index Indicator field (see
- Section 4.1).
- This field contains the size of the Block Header field,
- including the Block Header Size field itself. Valid values are
- in the range [0x01, 0xFF], which indicate the size of the Block
- Header as multiples of four bytes, minimum size being eight
- bytes:
- real_header_size = (encoded_header_size + 1) * 4;
- If a Block Header bigger than 1024 bytes is needed in the
- future, a new field can be added between the Block Header and
- Compressed Data fields. The presence of this new field would
- be indicated in the Block Header field.
- 3.1.2. Block Flags
- The Block Flags field is a bit field:
- Bit(s) Mask Description
- 0-1 0x03 Number of filters (1-4)
- 2-5 0x3C Reserved for future use; MUST be zero for now.
- 6 0x40 The Compressed Size field is present.
- 7 0x80 The Uncompressed Size field is present.
- If any reserved bit is set, the decoder MUST indicate an error.
- It is possible that there is a new field present which the
- decoder is not aware of, and can thus parse the Block Header
- incorrectly.
- 3.1.3. Compressed Size
- This field is present only if the appropriate bit is set in
- the Block Flags field (see Section 3.1.2).
- The Compressed Size field contains the size of the Compressed
- Data field, which MUST be non-zero. Compressed Size is stored
- using the encoding described in Section 1.2. If the Compressed
- Size doesn't match the size of the Compressed Data field, the
- decoder MUST indicate an error.
- 3.1.4. Uncompressed Size
- This field is present only if the appropriate bit is set in
- the Block Flags field (see Section 3.1.2).
- The Uncompressed Size field contains the size of the Block
- after uncompressing. Uncompressed Size is stored using the
- encoding described in Section 1.2. If the Uncompressed Size
- does not match the real uncompressed size, the decoder MUST
- indicate an error.
- Storing the Compressed Size and Uncompressed Size fields serves
- several purposes:
- - The decoder knows how much memory it needs to allocate
- for a temporary buffer in multithreaded mode.
- - Simple error detection: wrong size indicates a broken file.
- - Seeking forwards to a specific location in streamed mode.
- It should be noted that the only reliable way to determine
- the real uncompressed size is to uncompress the Block,
- because the Block Header and Index fields may contain
- (intentionally or unintentionally) invalid information.
- 3.1.5. List of Filter Flags
- +================+================+ +================+
- | Filter 0 Flags | Filter 1 Flags | ... | Filter n Flags |
- +================+================+ +================+
- The number of Filter Flags fields is stored in the Block Flags
- field (see Section 3.1.2).
- The format of each Filter Flags field is as follows:
- +===========+====================+===================+
- | Filter ID | Size of Properties | Filter Properties |
- +===========+====================+===================+
- Both Filter ID and Size of Properties are stored using the
- encoding described in Section 1.2. Size of Properties indicates
- the size of the Filter Properties field as bytes. The list of
- officially defined Filter IDs and the formats of their Filter
- Properties are described in Section 5.3.
- Filter IDs greater than or equal to 0x4000_0000_0000_0000
- (2^62) are reserved for implementation-specific internal use.
- These Filter IDs MUST never be used in List of Filter Flags.
- 3.1.6. Header Padding
- This field contains as many null byte as it is needed to make
- the Block Header have the size specified in Block Header Size.
- If any of the bytes are not null bytes, the decoder MUST
- indicate an error. It is possible that there is a new field
- present which the decoder is not aware of, and can thus parse
- the Block Header incorrectly.
- 3.1.7. CRC32
- The CRC32 is calculated over everything in the Block Header
- field except the CRC32 field itself. It is stored as an
- unsigned 32-bit little endian integer. If the calculated
- value does not match the stored one, the decoder MUST indicate
- an error.
- By verifying the CRC32 of the Block Header before parsing the
- actual contents allows the decoder to distinguish between
- corrupt and unsupported files.
- 3.2. Compressed Data
- The format of Compressed Data depends on Block Flags and List
- of Filter Flags. Excluding the descriptions of the simplest
- filters in Section 5.3, the format of the filter-specific
- encoded data is out of scope of this document.
- 3.3. Block Padding
- Block Padding MUST contain 0-3 null bytes to make the size of
- the Block a multiple of four bytes. This can be needed when
- the size of Compressed Data is not a multiple of four. If any
- of the bytes in Block Padding are not null bytes, the decoder
- MUST indicate an error.
- 3.4. Check
- The type and size of the Check field depends on which bits
- are set in the Stream Flags field (see Section 2.1.1.2).
- The Check, when used, is calculated from the original
- uncompressed data. If the calculated Check does not match the
- stored one, the decoder MUST indicate an error. If the selected
- type of Check is not supported by the decoder, it SHOULD
- indicate a warning or error.
- 4. Index
- +-----------------+===================+
- | Index Indicator | Number of Records |
- +-----------------+===================+
- +=================+===============+-+-+-+-+
- ---> | List of Records | Index Padding | CRC32 |
- +=================+===============+-+-+-+-+
- Index serves several purposes. Using it, one can
- - verify that all Blocks in a Stream have been processed;
- - find out the uncompressed size of a Stream; and
- - quickly access the beginning of any Block (random access).
- 4.1. Index Indicator
- This field overlaps with the Block Header Size field (see
- Section 3.1.1). The value of Index Indicator is always 0x00.
- 4.2. Number of Records
- This field indicates how many Records there are in the List
- of Records field, and thus how many Blocks there are in the
- Stream. The value is stored using the encoding described in
- Section 1.2. If the decoder has decoded all the Blocks of the
- Stream, and then notices that the Number of Records doesn't
- match the real number of Blocks, the decoder MUST indicate an
- error.
- 4.3. List of Records
- List of Records consists of as many Records as indicated by the
- Number of Records field:
- +========+========+
- | Record | Record | ...
- +========+========+
- Each Record contains information about one Block:
- +===============+===================+
- | Unpadded Size | Uncompressed Size |
- +===============+===================+
- If the decoder has decoded all the Blocks of the Stream, it
- MUST verify that the contents of the Records match the real
- Unpadded Size and Uncompressed Size of the respective Blocks.
- Implementation hint: It is possible to verify the Index with
- constant memory usage by calculating for example SHA-256 of
- both the real size values and the List of Records, then
- comparing the hash values. Implementing this using
- non-cryptographic hash like CRC32 SHOULD be avoided unless
- small code size is important.
- If the decoder supports random-access reading, it MUST verify
- that Unpadded Size and Uncompressed Size of every completely
- decoded Block match the sizes stored in the Index. If only
- partial Block is decoded, the decoder MUST verify that the
- processed sizes don't exceed the sizes stored in the Index.
- 4.3.1. Unpadded Size
- This field indicates the size of the Block excluding the Block
- Padding field. That is, Unpadded Size is the size of the Block
- Header, Compressed Data, and Check fields. Unpadded Size is
- stored using the encoding described in Section 1.2. The value
- MUST never be zero; with the current structure of Blocks, the
- actual minimum value for Unpadded Size is five.
- Implementation note: Because the size of the Block Padding
- field is not included in Unpadded Size, calculating the total
- size of a Stream or doing random-access reading requires
- calculating the actual size of the Blocks by rounding Unpadded
- Sizes up to the next multiple of four.
- The reason to exclude Block Padding from Unpadded Size is to
- ease making a raw copy of Compressed Data without Block
- Padding. This can be useful, for example, if someone wants
- to convert Streams to some other file format quickly.
- 4.3.2. Uncompressed Size
- This field indicates the Uncompressed Size of the respective
- Block as bytes. The value is stored using the encoding
- described in Section 1.2.
- 4.4. Index Padding
- This field MUST contain 0-3 null bytes to pad the Index to
- a multiple of four bytes. If any of the bytes are not null
- bytes, the decoder MUST indicate an error.
- 4.5. CRC32
- The CRC32 is calculated over everything in the Index field
- except the CRC32 field itself. The CRC32 is stored as an
- unsigned 32-bit little endian integer. If the calculated
- value does not match the stored one, the decoder MUST indicate
- an error.
- 5. Filter Chains
- The Block Flags field defines how many filters are used. When
- more than one filter is used, the filters are chained; that is,
- the output of one filter is the input of another filter. The
- following figure illustrates the direction of data flow.
- v Uncompressed Data ^
- | Filter 0 |
- Encoder | Filter 1 | Decoder
- | Filter n |
- v Compressed Data ^
- 5.1. Alignment
- Alignment of uncompressed input data is usually the job of
- the application producing the data. For example, to get the
- best results, an archiver tool should make sure that all
- PowerPC executable files in the archive stream start at
- offsets that are multiples of four bytes.
- Some filters, for example LZMA2, can be configured to take
- advantage of specified alignment of input data. Note that
- taking advantage of aligned input can be beneficial also when
- a filter is not the first filter in the chain. For example,
- if you compress PowerPC executables, you may want to use the
- PowerPC filter and chain that with the LZMA2 filter. Because
- not only the input but also the output alignment of the PowerPC
- filter is four bytes, it is now beneficial to set LZMA2
- settings so that the LZMA2 encoder can take advantage of its
- four-byte-aligned input data.
- The output of the last filter in the chain is stored to the
- Compressed Data field, which is is guaranteed to be aligned
- to a multiple of four bytes relative to the beginning of the
- Stream. This can increase
- - speed, if the filtered data is handled multiple bytes at
- a time by the filter-specific encoder and decoder,
- because accessing aligned data in computer memory is
- usually faster; and
- - compression ratio, if the output data is later compressed
- with an external compression tool.
- 5.2. Security
- If filters would be allowed to be chained freely, it would be
- possible to create malicious files, that would be very slow to
- decode. Such files could be used to create denial of service
- attacks.
- Slow files could occur when multiple filters are chained:
- v Compressed input data
- | Filter 1 decoder (last filter)
- | Filter 0 decoder (non-last filter)
- v Uncompressed output data
- The decoder of the last filter in the chain produces a lot of
- output from little input. Another filter in the chain takes the
- output of the last filter, and produces very little output
- while consuming a lot of input. As a result, a lot of data is
- moved inside the filter chain, but the filter chain as a whole
- gets very little work done.
- To prevent this kind of slow files, there are restrictions on
- how the filters can be chained. These restrictions MUST be
- taken into account when designing new filters.
- The maximum number of filters in the chain has been limited to
- four, thus there can be at maximum of three non-last filters.
- Of these three non-last filters, only two are allowed to change
- the size of the data.
- The non-last filters, that change the size of the data, MUST
- have a limit how much the decoder can compress the data: the
- decoder SHOULD produce at least n bytes of output when the
- filter is given 2n bytes of input. This limit is not
- absolute, but significant deviations MUST be avoided.
- The above limitations guarantee that if the last filter in the
- chain produces 4n bytes of output, the chain as a whole will
- produce at least n bytes of output.
- 5.3. Filters
- 5.3.1. LZMA2
- LZMA (Lempel-Ziv-Markov chain-Algorithm) is a general-purpose
- compression algorithm with high compression ratio and fast
- decompression. LZMA is based on LZ77 and range coding
- algorithms.
- LZMA2 is an extension on top of the original LZMA. LZMA2 uses
- LZMA internally, but adds support for flushing the encoder,
- uncompressed chunks, eases stateful decoder implementations,
- and improves support for multithreading. Thus, the plain LZMA
- will not be supported in this file format.
- Filter ID: 0x21
- Size of Filter Properties: 1 byte
- Changes size of data: Yes
- Allow as a non-last filter: No
- Allow as the last filter: Yes
- Preferred alignment:
- Input data: Adjustable to 1/2/4/8/16 byte(s)
- Output data: 1 byte
- The format of the one-byte Filter Properties field is as
- follows:
- Bits Mask Description
- 0-5 0x3F Dictionary Size
- 6-7 0xC0 Reserved for future use; MUST be zero for now.
- Dictionary Size is encoded with one-bit mantissa and five-bit
- exponent. The smallest dictionary size is 4 KiB and the biggest
- is 4 GiB.
- Raw value Mantissa Exponent Dictionary size
- 0 2 11 4 KiB
- 1 3 11 6 KiB
- 2 2 12 8 KiB
- 3 3 12 12 KiB
- 4 2 13 16 KiB
- 5 3 13 24 KiB
- 6 2 14 32 KiB
- ... ... ... ...
- 35 3 27 768 MiB
- 36 2 28 1024 MiB
- 37 3 29 1536 MiB
- 38 2 30 2048 MiB
- 39 3 30 3072 MiB
- 40 2 31 4096 MiB - 1 B
- Instead of having a table in the decoder, the dictionary size
- can be decoded using the following C code:
- const uint8_t bits = get_dictionary_flags() & 0x3F;
- if (bits > 40)
- return DICTIONARY_TOO_BIG; // Bigger than 4 GiB
- uint32_t dictionary_size;
- if (bits == 40) {
- dictionary_size = UINT32_MAX;
- } else {
- dictionary_size = 2 | (bits & 1);
- dictionary_size <<= bits / 2 + 11;
- }
- 5.3.2. Branch/Call/Jump Filters for Executables
- These filters convert relative branch, call, and jump
- instructions to their absolute counterparts in executable
- files. This conversion increases redundancy and thus
- compression ratio.
- Size of Filter Properties: 0 or 4 bytes
- Changes size of data: No
- Allow as a non-last filter: Yes
- Allow as the last filter: No
- Below is the list of filters in this category. The alignment
- is the same for both input and output data.
- Filter ID Alignment Description
- 0x04 1 byte x86 filter (BCJ)
- 0x05 4 bytes PowerPC (big endian) filter
- 0x06 16 bytes IA64 filter
- 0x07 4 bytes ARM filter [1]
- 0x08 2 bytes ARM Thumb filter [1]
- 0x09 4 bytes SPARC filter
- 0x0A 4 bytes ARM64 filter [2]
- [1] These are for little endian instruction encoding.
- This must not be confused with data endianness.
- A processor configured for big endian data access
- may still use little endian instruction encoding.
- The filters don't care about the data endianness.
- [2] 4096-byte alignment gives the best results
- because the address in the ADRP instruction
- is a multiple of 4096 bytes.
- If the size of Filter Properties is four bytes, the Filter
- Properties field contains the start offset used for address
- conversions. It is stored as an unsigned 32-bit little endian
- integer. The start offset MUST be a multiple of the alignment
- of the filter as listed in the table above; if it isn't, the
- decoder MUST indicate an error. If the size of Filter
- Properties is zero, the start offset is zero.
- Setting the start offset may be useful if an executable has
- multiple sections, and there are many cross-section calls.
- Taking advantage of this feature usually requires usage of
- the Subblock filter, whose design is not complete yet.
- 5.3.3. Delta
- The Delta filter may increase compression ratio when the value
- of the next byte correlates with the value of an earlier byte
- at specified distance.
- Filter ID: 0x03
- Size of Filter Properties: 1 byte
- Changes size of data: No
- Allow as a non-last filter: Yes
- Allow as the last filter: No
- Preferred alignment:
- Input data: 1 byte
- Output data: Same as the original input data
- The Properties byte indicates the delta distance, which can be
- 1-256 bytes backwards from the current byte: 0x00 indicates
- distance of 1 byte and 0xFF distance of 256 bytes.
- 5.3.3.1. Format of the Encoded Output
- The code below illustrates both encoding and decoding with
- the Delta filter.
- // Distance is in the range [1, 256].
- const unsigned int distance = get_properties_byte() + 1;
- uint8_t pos = 0;
- uint8_t delta[256];
- memset(delta, 0, sizeof(delta));
- while (1) {
- const int byte = read_byte();
- if (byte == EOF)
- break;
- uint8_t tmp = delta[(uint8_t)(distance + pos)];
- if (is_encoder) {
- tmp = (uint8_t)(byte) - tmp;
- delta[pos] = (uint8_t)(byte);
- } else {
- tmp = (uint8_t)(byte) + tmp;
- delta[pos] = tmp;
- }
- write_byte(tmp);
- --pos;
- }
- 5.4. Custom Filter IDs
- If a developer wants to use custom Filter IDs, there are two
- choices. The first choice is to contact Lasse Collin and ask
- him to allocate a range of IDs for the developer.
- The second choice is to generate a 40-bit random integer
- which the developer can use as a personal Developer ID.
- To minimize the risk of collisions, Developer ID has to be
- a randomly generated integer, not manually selected "hex word".
- The following command, which works on many free operating
- systems, can be used to generate Developer ID:
- dd if=/dev/urandom bs=5 count=1 | hexdump
- The developer can then use the Developer ID to create unique
- (well, hopefully unique) Filter IDs.
- Bits Mask Description
- 0-15 0x0000_0000_0000_FFFF Filter ID
- 16-55 0x00FF_FFFF_FFFF_0000 Developer ID
- 56-62 0x3F00_0000_0000_0000 Static prefix: 0x3F
- The resulting 63-bit integer will use 9 bytes of space when
- stored using the encoding described in Section 1.2. To get
- a shorter ID, see the beginning of this Section how to
- request a custom ID range.
- 5.4.1. Reserved Custom Filter ID Ranges
- Range Description
- 0x0000_0300 - 0x0000_04FF Reserved to ease .7z compatibility
- 0x0002_0000 - 0x0007_FFFF Reserved to ease .7z compatibility
- 0x0200_0000 - 0x07FF_FFFF Reserved to ease .7z compatibility
- 6. Cyclic Redundancy Checks
- There are several incompatible variations to calculate CRC32
- and CRC64. For simplicity and clarity, complete examples are
- provided to calculate the checks as they are used in this file
- format. Implementations MAY use different code as long as it
- gives identical results.
- The program below reads data from standard input, calculates
- the CRC32 and CRC64 values, and prints the calculated values
- as big endian hexadecimal strings to standard output.
- #include <stddef.h>
- #include <inttypes.h>
- #include <stdio.h>
- uint32_t crc32_table[256];
- uint64_t crc64_table[256];
- void
- init(void)
- {
- static const uint32_t poly32 = UINT32_C(0xEDB88320);
- static const uint64_t poly64
- = UINT64_C(0xC96C5795D7870F42);
- for (size_t i = 0; i < 256; ++i) {
- uint32_t crc32 = i;
- uint64_t crc64 = i;
- for (size_t j = 0; j < 8; ++j) {
- if (crc32 & 1)
- crc32 = (crc32 >> 1) ^ poly32;
- else
- crc32 >>= 1;
- if (crc64 & 1)
- crc64 = (crc64 >> 1) ^ poly64;
- else
- crc64 >>= 1;
- }
- crc32_table[i] = crc32;
- crc64_table[i] = crc64;
- }
- }
- uint32_t
- crc32(const uint8_t *buf, size_t size, uint32_t crc)
- {
- crc = ~crc;
- for (size_t i = 0; i < size; ++i)
- crc = crc32_table[buf[i] ^ (crc & 0xFF)]
- ^ (crc >> 8);
- return ~crc;
- }
- uint64_t
- crc64(const uint8_t *buf, size_t size, uint64_t crc)
- {
- crc = ~crc;
- for (size_t i = 0; i < size; ++i)
- crc = crc64_table[buf[i] ^ (crc & 0xFF)]
- ^ (crc >> 8);
- return ~crc;
- }
- int
- main()
- {
- init();
- uint32_t value32 = 0;
- uint64_t value64 = 0;
- uint64_t total_size = 0;
- uint8_t buf[8192];
- while (1) {
- const size_t buf_size
- = fread(buf, 1, sizeof(buf), stdin);
- if (buf_size == 0)
- break;
- total_size += buf_size;
- value32 = crc32(buf, buf_size, value32);
- value64 = crc64(buf, buf_size, value64);
- }
- printf("Bytes: %" PRIu64 "\n", total_size);
- printf("CRC-32: 0x%08" PRIX32 "\n", value32);
- printf("CRC-64: 0x%016" PRIX64 "\n", value64);
- return 0;
- }
- 7. References
- LZMA SDK - The original LZMA implementation
- http://7-zip.org/sdk.html
- LZMA Utils - LZMA adapted to POSIX-like systems
- http://tukaani.org/lzma/
- XZ Utils - The next generation of LZMA Utils
- http://tukaani.org/xz/
- [RFC-1952]
- GZIP file format specification version 4.3
- http://www.ietf.org/rfc/rfc1952.txt
- - Notation of byte boxes in section "2.1. Overall conventions"
- [RFC-2119]
- Key words for use in RFCs to Indicate Requirement Levels
- http://www.ietf.org/rfc/rfc2119.txt
- [GNU-tar]
- GNU tar 1.21 manual
- http://www.gnu.org/software/tar/manual/html_node/Blocking-Factor.html
- - Node 9.4.2 "Blocking Factor", paragraph that begins
- "gzip will complain about trailing garbage"
- - Note that this URL points to the latest version of the
- manual, and may some day not contain the note which is in
- 1.21. For the exact version of the manual, download GNU
- tar 1.21: ftp://ftp.gnu.org/pub/gnu/tar/tar-1.21.tar.gz
|