# -*- coding: utf-8 -*-
"""
hpack/hpack
~~~~~~~~~~~
Implements the HPACK header compression algorithm as detailed by the IETF.
"""
import collections
import logging
from .compat import to_byte
from .huffman import HuffmanDecoder, HuffmanEncoder
from .huffman_constants import (
REQUEST_CODES, REQUEST_CODES_LENGTH
)
log = logging.getLogger(__name__)
def encode_integer(integer, prefix_bits):
"""
This encodes an integer according to the wacky integer encoding rules
defined in the HPACK spec.
"""
log.debug("Encoding %d with %d bits", integer, prefix_bits)
max_number = (2 ** prefix_bits) - 1
if (integer < max_number):
return bytearray([integer]) # Seriously?
else:
elements = [max_number]
integer = integer - max_number
while integer >= 128:
elements.append((integer % 128) + 128)
integer = integer // 128 # We need integer division
elements.append(integer)
return bytearray(elements)
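# For illustration only: the worked examples from RFC 7541 Appendix C.1 come
# out of encode_integer as follows.
#
#     encode_integer(10, 5)    -> 0x0a            (fits within the 5-bit prefix)
#     encode_integer(1337, 5)  -> 0x1f 0x9a 0x0a  (prefix full, then continuation octets)
#     encode_integer(42, 8)    -> 0x2a            (fits within the 8-bit prefix)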
def decode_integer(data, prefix_bits):
"""
This decodes an integer according to the wacky integer encoding rules
defined in the HPACK spec. Returns a tuple of the decoded integer and the
number of bytes that were consumed from ``data`` in order to get that
integer.
"""
multiple = lambda index: 128 ** (index - 1)
max_number = (2 ** prefix_bits) - 1
mask = 0xFF >> (8 - prefix_bits)
index = 0
number = to_byte(data[index]) & mask
if (number == max_number):
while True:
index += 1
next_byte = to_byte(data[index])
if next_byte >= 128:
number += (next_byte - 128) * multiple(index)
else:
number += next_byte * multiple(index)
break
log.debug("Decoded %d, consumed %d bytes", number, index + 1)
return (number, index + 1)
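# For illustration only: decoding reverses the encodings above and also
# reports how many octets were consumed.
#
#     decode_integer(b'\x0a', 5)          -> (10, 1)
#     decode_integer(b'\x1f\x9a\x0a', 5)  -> (1337, 3)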
def _to_bytes(string):
"""
Convert string to bytes.
"""
if not isinstance(string, (str, bytes)): # pragma: no cover
string = str(string)
return string if isinstance(string, bytes) else string.encode('utf-8')
def header_table_size(table):
"""
Calculates the 'size' of the header table as defined by the HTTP/2
specification.
"""
# It's phenomenally frustrating that the specification feels it is able to
# tell me how large the header table is, considering that its calculations
# assume a very particular layout that most implementations will not have.
# I appreciate it's an attempt to prevent DoS attacks by sending lots of
# large headers in the header table, but it seems like a better approach
# would be to limit the size of headers. Ah well.
return sum(32 + len(name) + len(value) for name, value in table)
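# For illustration only: each entry costs 32 octets of overhead plus the
# lengths of its name and value, so a table containing just
# (b':method', b'GET') has a size of 32 + 7 + 3 = 42 octets.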
class Encoder(object):
"""
An HPACK encoder object. This object takes HTTP headers and emits encoded
HTTP/2 header blocks.
"""
# This is the static table of header fields.
static_table = [
(b':authority', b''),
(b':method', b'GET'),
(b':method', b'POST'),
(b':path', b'/'),
(b':path', b'/index.html'),
(b':scheme', b'http'),
(b':scheme', b'https'),
(b':status', b'200'),
(b':status', b'204'),
(b':status', b'206'),
(b':status', b'304'),
(b':status', b'400'),
(b':status', b'404'),
(b':status', b'500'),
(b'accept-charset', b''),
(b'accept-encoding', b'gzip, deflate'),
(b'accept-language', b''),
(b'accept-ranges', b''),
(b'accept', b''),
(b'access-control-allow-origin', b''),
(b'age', b''),
(b'allow', b''),
(b'authorization', b''),
(b'cache-control', b''),
(b'content-disposition', b''),
(b'content-encoding', b''),
(b'content-language', b''),
(b'content-length', b''),
(b'content-location', b''),
(b'content-range', b''),
(b'content-type', b''),
(b'cookie', b''),
(b'date', b''),
(b'etag', b''),
(b'expect', b''),
(b'expires', b''),
(b'from', b''),
(b'host', b''),
(b'if-match', b''),
(b'if-modified-since', b''),
(b'if-none-match', b''),
(b'if-range', b''),
(b'if-unmodified-since', b''),
(b'last-modified', b''),
(b'link', b''),
(b'location', b''),
(b'max-forwards', b''),
(b'proxy-authenticate', b''),
(b'proxy-authorization', b''),
(b'range', b''),
(b'referer', b''),
(b'refresh', b''),
(b'retry-after', b''),
(b'server', b''),
(b'set-cookie', b''),
(b'strict-transport-security', b''),
(b'transfer-encoding', b''),
(b'user-agent', b''),
(b'vary', b''),
(b'via', b''),
(b'www-authenticate', b''),
]
def __init__(self):
self.header_table = collections.deque()
        self._header_table_size = 4096  # This value is set by the standard.
self.huffman_coder = HuffmanEncoder(
REQUEST_CODES, REQUEST_CODES_LENGTH
)
# We need to keep track of whether the header table size has been
# changed since we last encoded anything. If it has, we need to signal
# that change in the HPACK block.
self._table_size_changed = False
@property
def header_table_size(self):
return self._header_table_size
@header_table_size.setter
def header_table_size(self, value):
log.debug(
"Setting header table size to %d from %d",
value,
self._header_table_size
)
# If the new value is larger than the current one, no worries!
# Otherwise, we may need to shrink the header table.
if value < self._header_table_size:
current_size = header_table_size(self.header_table)
while value < current_size:
header = self.header_table.pop()
n, v = header
current_size -= (
32 + len(n) + len(v)
)
log.debug(
"Removed %s: %s from the encoder header table", n, v
)
if value != self._header_table_size:
self._table_size_changed = True
self._header_table_size = value
def encode(self, headers, huffman=True):
"""
Takes a set of headers and encodes them into a HPACK-encoded header
block.
Transforming the headers into a header block is a procedure that can
be modeled as a chain or pipe. First, the headers are encoded. This
encoding can be done a number of ways. If the header name-value pair
are already in the header table we can represent them using the indexed
representation: the same is true if they are in the static table.
Otherwise, a literal representation will be used.
"""
log.debug("HPACK encoding %s", headers)
header_block = []
# Turn the headers into a list of tuples if possible. This is the
# natural way to interact with them in HPACK.
if isinstance(headers, dict):
headers = headers.items()
# Next, walk across the headers and turn them all into bytestrings.
headers = [(_to_bytes(n), _to_bytes(v)) for n, v in headers]
# Before we begin, if the header table size has been changed we need
# to signal that appropriately.
if self._table_size_changed:
header_block.append(self._encode_table_size_change())
self._table_size_changed = False
# We can now encode each header in the block.
header_block.extend(
(self.add(header, huffman) for header in headers)
)
header_block = b''.join(header_block)
log.debug("Encoded header block to %s", header_block)
return header_block
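    # For illustration only, a minimal usage sketch. The exact output assumes
    # a fresh Encoder, so both headers match the static table exactly and are
    # emitted as single indexed octets:
    #
    #     e = Encoder()
    #     e.encode([(':method', 'GET'), (':path', '/')], huffman=False)
    #     # -> b'\x82\x84'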
def add(self, to_add, huffman=False):
"""
This function takes a header key-value tuple and serializes it.
"""
log.debug("Adding %s to the header table", to_add)
name, value = to_add
# Search for a matching header in the header table.
match = self.matching_header(name, value)
if match is None:
# Not in the header table. Encode using the literal syntax,
# and add it to the header table.
encoded = self._encode_literal(name, value, True, huffman)
self._add_to_header_table(to_add)
return encoded
        # The header is in the table; break out the values. If we matched
        # perfectly, we can use the indexed representation; otherwise we
        # use the indexed literal.
index, perfect = match
if perfect:
# Indexed representation.
encoded = self._encode_indexed(index)
else:
            # Indexed literal. We are going to add the header to the
            # header table unconditionally. It is a future todo to
            # filter out headers which are known to be ineffective for
            # indexing, since they just take up space in the table and
            # push out other valuable headers.
encoded = self._encode_indexed_literal(index, value, huffman)
self._add_to_header_table(to_add)
return encoded
def matching_header(self, name, value):
"""
Scans the header table and the static table. Returns a tuple, where the
first value is the index of the match, and the second is whether there
was a full match or not. Prefers full matches to partial ones.
Upsettingly, the header table is one-indexed, not zero-indexed.
"""
partial_match = None
static_table_len = len(Encoder.static_table)
for (i, (n, v)) in enumerate(Encoder.static_table):
if n == name:
if v == value:
return (i + 1, Encoder.static_table[i])
elif partial_match is None:
partial_match = (i + 1, None)
for (i, (n, v)) in enumerate(self.header_table):
if n == name:
if v == value:
return (i + static_table_len + 1, self.header_table[i])
elif partial_match is None:
partial_match = (i + static_table_len + 1, None)
return partial_match
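    # For illustration only: on a fresh Encoder, a perfect match returns the
    # matched entry, while a name-only match returns None in its place.
    #
    #     e.matching_header(b':method', b'GET')  -> (2, (b':method', b'GET'))
    #     e.matching_header(b':method', b'PUT')  -> (2, None)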
def _add_to_header_table(self, header):
"""
Adds a header to the header table, evicting old ones if necessary.
"""
# Be optimistic: add the header straight away.
self.header_table.appendleft(header)
# Now, work out how big the header table is.
actual_size = header_table_size(self.header_table)
# Loop and remove whatever we need to.
while actual_size > self.header_table_size:
header = self.header_table.pop()
n, v = header
actual_size -= (
32 + len(n) + len(v)
)
log.debug("Evicted %s: %s from the header table", n, v)
def _encode_indexed(self, index):
"""
Encodes a header using the indexed representation.
"""
field = encode_integer(index, 7)
field[0] = field[0] | 0x80 # we set the top bit
return bytes(field)
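    # For illustration only: index 2 (:method: GET in the static table)
    # encodes to the single octet 0x82 (the 0x80 flag | the integer 2).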
def _encode_literal(self, name, value, indexing, huffman=False):
"""
Encodes a header with a literal name and literal value. If ``indexing``
is True, the header will be added to the header table: otherwise it
will not.
"""
prefix = b'\x40' if indexing else b'\x00'
if huffman:
name = self.huffman_coder.encode(name)
value = self.huffman_coder.encode(value)
name_len = encode_integer(len(name), 7)
value_len = encode_integer(len(value), 7)
if huffman:
name_len[0] |= 0x80
value_len[0] |= 0x80
return b''.join([prefix, bytes(name_len), name, bytes(value_len), value])
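    # For illustration only: with Huffman coding disabled, a new name and
    # value encode as in RFC 7541 Appendix C.2.1:
    #
    #     self._encode_literal(b'custom-key', b'custom-header', True)
    #     # -> 0x40, 0x0a, b'custom-key', 0x0d, b'custom-header' joined together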
def _encode_indexed_literal(self, index, value, huffman=False):
"""
Encodes a header with an indexed name and a literal value and performs
incremental indexing.
"""
prefix = encode_integer(index, 6)
prefix[0] |= 0x40
if huffman:
value = self.huffman_coder.encode(value)
value_len = encode_integer(len(value), 7)
if huffman:
value_len[0] |= 0x80
return b''.join([bytes(prefix), bytes(value_len), value])
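    # For illustration only: indexed name 4 (:path) with the literal value
    # b'/sample/path' and no Huffman coding comes out as
    # 0x44 (the 0x40 flag | index 4), then 0x0c, then b'/sample/path'.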
def _encode_table_size_change(self):
"""
Produces the encoded form of a header table size change context update.
"""
size_bytes = encode_integer(self.header_table_size, 5)
size_bytes[0] |= 0x20
return bytes(size_bytes)
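    # For illustration only: with the default size of 4096, the context update
    # encodes as 0x3f 0xe1 0x1f (the 0x20 flag | a full 5-bit prefix, then the
    # remaining 4065 in continuation octets).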
class Decoder(object):
"""
An HPACK decoder object.
"""
static_table = [
(b':authority', b''),
(b':method', b'GET'),
(b':method', b'POST'),
(b':path', b'/'),
(b':path', b'/index.html'),
(b':scheme', b'http'),
(b':scheme', b'https'),
(b':status', b'200'),
(b':status', b'204'),
(b':status', b'206'),
(b':status', b'304'),
(b':status', b'400'),
(b':status', b'404'),
(b':status', b'500'),
(b'accept-charset', b''),
(b'accept-encoding', b'gzip, deflate'),
(b'accept-language', b''),
(b'accept-ranges', b''),
(b'accept', b''),
(b'access-control-allow-origin', b''),
(b'age', b''),
(b'allow', b''),
(b'authorization', b''),
(b'cache-control', b''),
(b'content-disposition', b''),
(b'content-encoding', b''),
(b'content-language', b''),
(b'content-length', b''),
(b'content-location', b''),
(b'content-range', b''),
(b'content-type', b''),
(b'cookie', b''),
(b'date', b''),
(b'etag', b''),
(b'expect', b''),
(b'expires', b''),
(b'from', b''),
(b'host', b''),
(b'if-match', b''),
(b'if-modified-since', b''),
(b'if-none-match', b''),
(b'if-range', b''),
(b'if-unmodified-since', b''),
(b'last-modified', b''),
(b'link', b''),
(b'location', b''),
(b'max-forwards', b''),
(b'proxy-authenticate', b''),
(b'proxy-authorization', b''),
(b'range', b''),
(b'referer', b''),
(b'refresh', b''),
(b'retry-after', b''),
(b'server', b''),
(b'set-cookie', b''),
(b'strict-transport-security', b''),
(b'transfer-encoding', b''),
(b'user-agent', b''),
(b'vary', b''),
(b'via', b''),
(b'www-authenticate', b''),
]
def __init__(self):
self.header_table = collections.deque()
        self._header_table_size = 4096  # This value is set by the standard.
self.huffman_coder = HuffmanDecoder(
REQUEST_CODES, REQUEST_CODES_LENGTH
)
@property
def header_table_size(self):
return self._header_table_size
@header_table_size.setter
def header_table_size(self, value):
log.debug(
"Resizing decoder header table to %d from %d",
value,
self._header_table_size
)
# If the new value is larger than the current one, no worries!
# Otherwise, we may need to shrink the header table.
if value < self._header_table_size:
current_size = header_table_size(self.header_table)
while value < current_size:
header = self.header_table.pop()
n, v = header
current_size -= (
32 + len(n) + len(v)
)
log.debug("Evicting %s: %s from the header table", n, v)
self._header_table_size = value
def decode(self, data):
"""
Takes an HPACK-encoded header block and decodes it into a header set.
"""
log.debug("Decoding %s", data)
headers = []
data_len = len(data)
current_index = 0
while current_index < data_len:
# Work out what kind of header we're decoding.
# If the high bit is 1, it's an indexed field.
current = to_byte(data[current_index])
indexed = bool(current & 0x80)
# Otherwise, if the second-highest bit is 1 it's a field that does
# alter the header table.
literal_index = bool(current & 0x40)
# Otherwise, if the third-highest bit is 1 it's an encoding context
# update.
encoding_update = bool(current & 0x20)
if indexed:
header, consumed = self._decode_indexed(data[current_index:])
elif literal_index:
# It's a literal header that does affect the header table.
header, consumed = self._decode_literal_index(
data[current_index:]
)
elif encoding_update:
# It's an update to the encoding context.
                consumed = self._update_encoding_context(
                    data[current_index:]
                )
header = None
else:
# It's a literal header that does not affect the header table.
header, consumed = self._decode_literal_no_index(
data[current_index:]
)
if header:
headers.append(header)
current_index += consumed
return [(n.decode('utf-8'), v.decode('utf-8')) for n, v in headers]
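    # For illustration only, the counterpart of the Encoder sketch above:
    #
    #     d = Decoder()
    #     d.decode(b'\x82\x84')
    #     # -> [(':method', 'GET'), (':path', '/')]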
def _add_to_header_table(self, new_header):
"""
Adds a header to the header table, evicting old ones if necessary.
"""
# Be optimistic: add the header straight away.
self.header_table.appendleft(new_header)
# Now, work out how big the header table is.
actual_size = header_table_size(self.header_table)
# Loop and remove whatever we need to.
while actual_size > self.header_table_size:
header = self.header_table.pop()
n, v = header
actual_size -= (
32 + len(n) + len(v)
)
log.debug("Evicting %s: %s from the header table", n, v)
def _update_encoding_context(self, data):
"""
Handles a byte that updates the encoding context.
"""
# We've been asked to resize the header table.
new_size, consumed = decode_integer(data, 5)
self.header_table_size = new_size
return consumed
def _decode_indexed(self, data):
"""
Decodes a header represented using the indexed representation.
"""
index, consumed = decode_integer(data, 7)
index -= 1 # Because this idiot table is 1-indexed. Ugh.
if index >= len(Decoder.static_table):
index -= len(Decoder.static_table)
header = self.header_table[index]
else:
header = Decoder.static_table[index]
log.debug("Decoded %s, consumed %d", header, consumed)
return header, consumed
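    # For illustration only: the single octet 0x82 decodes to static table
    # entry 2, i.e. ((b':method', b'GET'), 1).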
def _decode_literal_no_index(self, data):
return self._decode_literal(data, False)
def _decode_literal_index(self, data):
return self._decode_literal(data, True)
def _decode_literal(self, data, should_index):
"""
Decodes a header represented with a literal.
"""
total_consumed = 0
# When should_index is true, if the low six bits of the first byte are
# nonzero, the header name is indexed.
# When should_index is false, if the low four bits of the first byte
# are nonzero the header name is indexed.
if should_index:
indexed_name = to_byte(data[0]) & 0x3F
name_len = 6
else:
indexed_name = to_byte(data[0]) & 0x0F
name_len = 4
if indexed_name:
# Indexed header name.
index, consumed = decode_integer(data, name_len)
index -= 1
if index >= len(Decoder.static_table):
index -= len(Decoder.static_table)
name = self.header_table[index][0]
else:
name = Decoder.static_table[index][0]
total_consumed = consumed
length = 0
else:
# Literal header name. The first byte was consumed, so we need to
# move forward.
data = data[1:]
length, consumed = decode_integer(data, 7)
name = data[consumed:consumed + length]
if to_byte(data[0]) & 0x80:
name = self.huffman_coder.decode(name)
total_consumed = consumed + length + 1 # Since we moved forward 1.
data = data[consumed + length:]
# The header value is definitely length-based.
length, consumed = decode_integer(data, 7)
value = data[consumed:consumed + length]
if to_byte(data[0]) & 0x80:
value = self.huffman_coder.decode(value)
        # Update the total consumed length.
total_consumed += length + consumed
# If we've been asked to index this, add it to the header table.
header = (name, value)
if should_index:
self._add_to_header_table(header)
log.debug(
"Decoded %s, total consumed %d bytes, indexed %s",
header,
total_consumed,
should_index
)
return header, total_consumed
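    # For illustration only: feeding the RFC 7541 Appendix C.2.1 block
    # b'\x40\x0acustom-key\x0dcustom-header' through Decoder.decode yields
    # [('custom-key', 'custom-header')] and adds the pair to the header table.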