Small update / WIP.
I made some progress on the 0x158000 packed block in BU40N 1.00.
The original table split still looks correct:
Code: Select all
0x158008 288-byte literal/length code-length table
0x158128 32-byte distance code-length table
0x158148 compressed bitstream
I now have an experimental Python decoder that decodes the BU40N 1.00 block. The block header reports these sizes:
Code: Select all
compressed: 0x72464
decompressed: 0x4e98f
The format appears to be a custom canonical-Huffman + LZ77-style scheme, but not standard DEFLATE.
Current working assumptions:
Code: Select all
bitstream: MSB-first
Huffman: canonical codes, bit-reversed for lookup
symbol 256: literal zero, not EOF
symbols 257-287: length symbols
distance: raw distance symbol + 1
This produces an output file of the advertised decompressed size. As a sanity check, the decoded output contains:
0x27b76: CAETDVD_59110933
So it is definitely producing structured data from the packed block.
Important caveat: I do not think this is 100% solved yet. The output contains plausible Thumb-looking code and strings, but it does not currently decompile cleanly as one linear ARM/Thumb image. There may still be a small semantic difference in the decoder, a relocation/fixup step, a second transform, or simply mixed code/data/microcode in the decompressed payload.
Here is the current Python script:
Code: Select all
#!/usr/bin/env python3
import argparse
import struct
from pathlib import Path
# Base match length for each length symbol, indexed by (symbol - 257).
# The 31 entries cover literal/length symbols 257-287.
# NOTE(review): the first 29 values appear to match DEFLATE's length table
# (RFC 1951, section 3.2.5); the two extra trailing 258 entries for symbols
# 286-287 are an assumption of this experimental decoder - confirm.
LBASE = [
3, 4, 5, 6, 7, 8, 9, 10,
11, 13, 15, 17, 19, 23, 27, 31,
35, 43, 51, 59, 67, 83, 99, 115,
131, 163, 195, 227, 258, 258, 258,
]
# Number of extra bits that follow each length symbol in the bitstream,
# indexed by (symbol - 257) in parallel with LBASE. The value of those bits
# is added to the matching LBASE entry to obtain the final match length.
LEXT = [
0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 2, 2, 2, 2,
3, 3, 3, 3, 4, 4, 4, 4,
5, 5, 5, 5, 0, 0, 0,
]
class BitReader:
    """Sequential bit reader over a byte string.

    Bits are taken from each byte starting with its most significant bit.
    When several bits are read at once, the first bit read lands in bit 0
    of the returned integer, the second in bit 1, and so on.
    """

    def __init__(self, data: bytes):
        self.data = data
        # Absolute position of the next unread bit, counted from bit 0 of
        # the first byte.
        self.bitpos = 0

    def read(self, n: int) -> int:
        """Consume *n* bits and return them packed first-bit-lowest.

        Raises:
            EOFError: if the input runs out. Bits consumed before the end
                was reached stay consumed.
        """
        result = 0
        for shift in range(n):
            position = self.bitpos
            if position >= len(self.data) * 8:
                raise EOFError("ran out of compressed input")
            byte_index, bit_index = divmod(position, 8)
            # 0x80 >> bit_index selects bits from the top of the byte down.
            if self.data[byte_index] & (0x80 >> bit_index):
                result |= 1 << shift
            self.bitpos = position + 1
        return result
def reverse_bits(value: int, width: int) -> int:
    """Return the low *width* bits of *value* in mirrored order.

    Bit 0 of *value* becomes bit ``width - 1`` of the result and vice
    versa; bits of *value* above *width* are ignored.
    """
    mirrored = 0
    for position in range(width):
        if (value >> position) & 1:
            mirrored |= 1 << (width - 1 - position)
    return mirrored
def build_canonical_table(lengths: bytes) -> dict[tuple[int, int], int]:
    """Build a Huffman lookup table from per-symbol code lengths.

    *lengths* holds one code length per symbol (index = symbol); a length
    of zero means the symbol is unused. Codes are assigned canonically:
    shorter codes first, and within one length in increasing symbol order.

    Returns:
        A mapping ``(bit_reversed_code, code_length) -> symbol``. The code
        is stored bit-reversed so it can be matched against bits that are
        accumulated first-bit-lowest.
    """
    # How many symbols use each non-zero code length.
    histogram: dict[int, int] = {}
    for bit_len in filter(None, lengths):
        histogram[bit_len] = histogram.get(bit_len, 0) + 1

    longest = max(histogram, default=0)

    # First canonical code value for every length from 1 up to the longest.
    upcoming: dict[int, int] = {}
    running = 0
    for bit_len in range(1, longest + 1):
        running = (running + histogram.get(bit_len - 1, 0)) << 1
        upcoming[bit_len] = running

    lookup: dict[tuple[int, int], int] = {}
    for symbol, bit_len in enumerate(lengths):
        if bit_len:
            canonical = upcoming[bit_len]
            upcoming[bit_len] = canonical + 1
            # Required for this stream: codes are looked up bit-reversed.
            lookup[(reverse_bits(canonical, bit_len), bit_len)] = symbol
    return lookup
def decode_symbol(br: BitReader, table: dict[tuple[int, int], int]) -> int:
    """Read bits from *br* until they form a code present in *table*.

    Bits are accumulated first-bit-lowest and the ``(code, length)`` pair
    is looked up after every bit, so the shortest matching code wins.

    Raises:
        ValueError: if no code matches after 31 bits.
    """
    accumulated = 0
    bit_count = 0
    while bit_count < 31:
        accumulated |= br.read(1) << bit_count
        bit_count += 1
        match = table.get((accumulated, bit_count))
        if match is not None:
            return match
    raise ValueError(f"bad Huffman code at bit {br.bitpos}")
def decompress_partition(firmware: bytes, offset: int = 0x158000) -> tuple[bytes, int, int, int]:
    """Decode the packed block that starts at *offset* inside *firmware*.

    Layout read by this function, relative to *offset*:

    * ``+0x000`` two little-endian uint32 values, treated here as
      "compressed size" and "output size";
    * ``+0x008`` 288 literal/length code lengths, one byte per symbol;
    * ``+0x128`` 32 distance code lengths, one byte per symbol;
    * ``+0x148`` the bitstream (at most "compressed size" bytes are used).

    Symbols below 256 are literal bytes, symbol 256 is emitted as a literal
    zero byte, and symbols from 257 up select a match length via LBASE/LEXT
    followed by a distance symbol (distance = symbol + 1).

    NOTE(review): in the sample run quoted with this script the first header
    field (0x72464) is larger than both the output size (0x4e98f) and the
    number of stream bytes actually consumed (1632522 bits), so it may not
    be the true stream length - confirm.

    Args:
        firmware: the complete firmware image.
        offset: non-negative byte offset of the block header.

    Returns:
        ``(decoded, compressed_size, output_size, bits_consumed)``.

    Raises:
        ValueError: if *offset* is negative, a length symbol is out of
            range, or a match points before the start of the output.
        EOFError: if the firmware ends inside the code-length tables or the
            bitstream runs out before ``output_size`` bytes are produced.
        struct.error: if the 8-byte header does not fit in *firmware*.
    """
    lit_table_size = 288
    dist_table_size = 32

    # struct.unpack_from and slicing both accept negative offsets and would
    # silently read from the wrong end of the image, so reject them here.
    if offset < 0:
        raise ValueError(f"partition offset must be non-negative, got {offset}")

    compressed_size, output_size = struct.unpack_from("<II", firmware, offset)

    lit_table_off = offset + 8
    dist_table_off = lit_table_off + lit_table_size
    stream_off = dist_table_off + dist_table_size

    lit_lengths = firmware[lit_table_off:dist_table_off]
    dist_lengths = firmware[dist_table_off:stream_off]
    # Slicing never fails on a short buffer; without this check a truncated
    # image would yield incomplete Huffman tables instead of an error.
    if len(lit_lengths) != lit_table_size or len(dist_lengths) != dist_table_size:
        raise EOFError(
            f"firmware ends inside the code-length tables of the block at {offset:#x}"
        )
    stream = firmware[stream_off:stream_off + compressed_size]

    lit_tree = build_canonical_table(lit_lengths)
    dist_tree = build_canonical_table(dist_lengths)
    br = BitReader(stream)
    out = bytearray()

    while len(out) < output_size:
        symbol = decode_symbol(br, lit_tree)
        if symbol < 256:
            out.append(symbol)
            continue
        # In this format, symbol 256 behaves as literal zero.
        if symbol == 256:
            out.append(0)
            continue

        length_index = symbol - 257
        if length_index >= len(LBASE):
            raise ValueError(
                f"bad length symbol {symbol} at output={len(out):#x}, bit={br.bitpos}"
            )
        length = LBASE[length_index]
        extra_bits = LEXT[length_index]
        if extra_bits:
            length += br.read(extra_bits)

        distance_symbol = decode_symbol(br, dist_tree)
        # Unlike DEFLATE, this currently appears to use raw distance symbols.
        distance = distance_symbol + 1
        if distance > len(out):
            raise ValueError(
                f"invalid distance {distance} at output={len(out):#x}, bit={br.bitpos}"
            )

        # Copy one byte at a time: when distance < length the match overlaps
        # the bytes being written, so each byte must see the previous append.
        for _ in range(length):
            out.append(out[-distance])
            if len(out) >= output_size:
                break

    return bytes(out), compressed_size, output_size, br.bitpos
def main() -> None:
    """Command-line entry point.

    Reads the firmware image named on the command line, decodes the packed
    block at ``--offset``, writes the result to ``--output`` and prints a
    short summary of the sizes involved.
    """

    def parse_offset(text: str) -> int:
        # Base 0 accepts decimal as well as 0x/0o/0b prefixed literals.
        return int(text, 0)

    parser = argparse.ArgumentParser(
        description="Experimental BU40N 1.00 0x158000 partition decoder"
    )
    parser.add_argument("firmware", help="input BU40N firmware .bin")
    parser.add_argument(
        "-o", "--output", default="decoded_158000.bin", help="output file"
    )
    # Converting through argparse turns a malformed offset into a normal
    # usage error instead of an uncaught ValueError traceback. The string
    # default is passed through the same converter by argparse.
    parser.add_argument(
        "--offset",
        type=parse_offset,
        default="0x158000",
        help="byte offset of the packed block (decimal or 0x-prefixed hex)",
    )
    args = parser.parse_args()

    firmware = Path(args.firmware).read_bytes()
    offset = args.offset
    decoded, compressed_size, output_size, bits_used = decompress_partition(
        firmware, offset
    )
    Path(args.output).write_bytes(decoded)

    print(f"partition offset: {offset:#x}")
    print(f"compressed size: {compressed_size:#x}")
    print(f"decompressed size: {len(decoded):#x}/{output_size:#x}")
    print(f"bits consumed: {bits_used}")
    print(f"wrote: {args.output}")


if __name__ == "__main__":
    main()
Run with:
Code: Select all
python3 decode_158000.py BU40N_1.00_stock.bin
partition offset: 0x158000
compressed size: 0x72464
decompressed size: 0x4e98f/0x4e98f
bits consumed: 1632522
wrote: decoded_158000.bin
strings -a -tx decoded_158000.bin | grep CAETDVD
Expected string: CAETDVD_59110933 (at offset 0x27b76 in the decoded output)
If anyone has a RAM dump of this region after the drive has decompressed it, comparing that against this output would probably show exactly what is still missing.