from filetypes.base import *
import malcat
import re

# Patterns used to locate the structural keywords of a PDF file. They are
# handed to the analyzer's search() as text; OBJ_RE is additionally encoded
# to bytes and re-applied with the `re` module to extract the object number.

# Object header: one or two CR/LF characters, the object number (group 1),
# a whitespace or NUL separator, a second integer, then the keyword "obj".
OBJ_RE = r"[\x0d\x0a][\x0d\x0a]?(\d+)(?:\s|\x00)\d+\sobj\b"
# End of an object: one or two CR/LF characters followed by "endobj".
ENDOBJ_RE = r"[\x0d\x0a][\x0d\x0a]?endobj\b"
# The three keywords below must directly follow a CR or LF; the look-behind
# keeps that line break out of the match, so the match starts on the keyword.
XREF_RE = r"(?<=[\x0d\x0a])\bxref\b"
TRAILER_RE = r"(?<=[\x0d\x0a])\btrailer\b"
STARTXREF_RE = r"(?<=[\x0d\x0a])\bstartxref\b"

class PDFAnalyzer(FileTypeAnalyzer):
    """Structural analyzer for PDF documents.

    The file is walked one "%%EOF"-terminated chunk at a time. Inside each
    chunk the indirect objects, then the optional xref / trailer / startxref
    regions, are emitted as raw byte ranges. Every chunk that contains at
    least one object is also registered as a section named "version<N>".
    """
    category = malcat.FileType.DOCUMENT
    name = "PDF"
    # "%PDF", optionally followed by "-<digits>.<digits>", then up to 16
    # comment lines ("%" + 1..32 non-newline characters). The look-ahead
    # requires the first object header to follow immediately.
    regexp = r"%PDF(?:-\d+\.\d*)?(?:[\x0d\x0a]+%[^\x0d\x0a]{1,32}){0,16}(?=[\x0d\x0a]+\d+(?:\s|\x00)\d+\sobj\b)"

    def __init__(self):
        """Initialise the base analyzer and the (empty) object table."""
        FileTypeAnalyzer.__init__(self)
        # object number -> (offset of the object header, size up to and
        # including "endobj"); a later object with the same number replaces
        # the earlier entry.
        self.objects = {}

    def parse(self, hint):
        """Parse the file and yield its structures in order.

        Yields ``Bytes`` structures named "Signature", "Obj<N>", "XREF",
        "TRAILER" and "STARTXREF".

        Args:
            hint: not used by this parser.

        Raises:
            FatalError: if the signature is not at offset 0, if an object
                header cannot be re-parsed, if the first object of a chunk
                lies more than 65536 bytes after the chunk start, or if no
                object was found in the whole file.
        """
        off, sz = self.search(PDFAnalyzer.regexp)
        if off != 0:
            raise FatalError
        yield Bytes(sz, name="Signature", category=Type.HEADER)
        self.confirm()
        # Compiled once: used on every object header to extract its number.
        obj_header = re.compile(OBJ_RE.encode("ascii"))
        version = 0
        while self.remaining():
            section_start = self.tell()
            next_eof, sz_eof = self.search(r"%%EOF[\x0d\x0a]*", self.tell())
            if not next_eof:
                # No further %%EOF marker: the chunk runs to the end of file.
                next_eof = self.size()
            next_xref, _ = self.search(XREF_RE, self.tell(), next_eof - self.tell())
            next_trailer, _ = self.search(TRAILER_RE, self.tell(), next_eof - self.tell())
            next_startxref, _ = self.search(STARTXREF_RE, self.tell(), next_eof - self.tell())
            # Objects are only expected before the first of these markers.
            stop_object = min(filter(None, (next_xref, next_trailer, next_startxref, next_eof)))
            #parse objects
            first_obj = True
            while self.tell() < stop_object:
                # The header search is bounded by stop_object (not next_eof):
                # a header located after the xref/trailer/startxref keywords
                # would otherwise be emitted on top of those regions, or get
                # a negative size when its "endobj" is missing.
                next_obj, obj_sz = self.search(OBJ_RE, self.tell(), stop_object - self.tell())
                if not next_obj:
                    break
                m = obj_header.match(self.read(next_obj, obj_sz))
                if not m:
                    raise FatalError
                object_id = int(m.group(1).decode("ascii"))
                # Skip the leading line break(s): the object starts on its number.
                self.jump(next_obj + m.start(1))
                if first_obj:
                    if self.tell() - section_start > 65536:
                        raise FatalError("First object too far away from header")
                    # The section begins at its first object, not at the
                    # bytes (if any) that precede it.
                    section_start = self.tell()
                    first_obj = False
                next_endobj, endobj_sz = self.search(ENDOBJ_RE, self.tell(), next_eof - self.tell())
                if not next_endobj:
                    # Unterminated object: let it run up to the next marker.
                    next_endobj = stop_object
                else:
                    next_endobj += endobj_sz
                self.objects[object_id] = (self.tell(), next_endobj - self.tell())
                # NOTE(review): the loop relies on the framework moving the
                # cursor past each yielded structure - confirm.
                yield Bytes(next_endobj - self.tell(), name="Obj{:d}".format(object_id), category=Type.DATA)
            #parse xref
            if next_xref:
                self.jump(next_xref)
                stop_xref = min(filter(None, (next_trailer, next_startxref, next_eof)))
                yield Bytes(stop_xref - self.tell(), name="XREF", category=Type.FIXUP)
            #parse trailer
            if next_trailer:
                self.jump(next_trailer)
                stop_trailer = min(filter(None, (next_startxref, next_eof)))
                yield Bytes(stop_trailer - self.tell(), name="TRAILER", category=Type.HEADER)
            #parse startxref
            if next_startxref:
                self.jump(next_startxref)
                yield Bytes(next_eof - self.tell(), name="STARTXREF", category=Type.FIXUP)
            # Continue right after the %%EOF marker and its trailing line breaks.
            self.jump(next_eof + sz_eof)
            if not first_obj:
                # Only chunks that contained at least one object become a section.
                self.add_section("version{:d}".format(version), section_start, self.tell() - section_start)
                self.confirm()
                version += 1
        if not self.objects:
            raise FatalError("No object")

