# Subversion Repositories programming
#
# Rev
#
# Rev 358 | Blame | Compare with Previous | Last modification | View Log | RSS feed

"""Simple Par2 parsing class

This program is part of rarslave, an open-source program for
automatically checking, repairing, and extracting files, primarily
those which are downloaded from usenet.

Visit https://svn.irasnyder.com/svn/programming/rarslave for the
latest version.
"""

# Module metadata.
__author__ = "Ira W. Snyder (devel@irasnyder.com)"
__copyright__ = "Copyright (c) 2005,2006 Ira W. Snyder (devel@irasnyder.com)"
__license__ = "GNU GPL v2 (or, at your option, any later version)"

################################################################################
# The PAR2 Parser Module
#
# Copyright 2006, Ira W. Snyder (devel@irasnyder.com)
# License: GNU General Public License v2 (or, at your option, any later version)
#
# This was mostly "stolen" (read: borrowed, the GPL rocks) from cfv.
# See http://cfv.sourceforge.net/ for a copy.
################################################################################

import struct, errno
import os, re

class Par2Parser (object):
        """Locate and parse the PAR2 set that a given par2 file belongs to.

        All of the work happens in the constructor: the current working
        directory is scanned for files whose names start with the set's
        base name, every file among them ending in ".par2" is parsed and
        classified as good or corrupt, and the list of protected filenames
        is taken from the first good par2 file (in sorted order).  The
        results are then available through the get_*() methods.

        The code is written to behave the same on Python 2.6+ and to also
        run on Python 3."""

        # Values of the 16-byte packet type field that this parser acts on.
        # Packets of any other type are hash-checked and then ignored.
        _PKT_TYPE_FILE_DESC = b'PAR 2.0\0FileDesc'
        _PKT_TYPE_MAIN = b'PAR 2.0\0Main\0\0\0\0'

        def __init__ (self, par2_filename):
                """Constructor for the Par2Parser class.

                   par2_filename -- name of any par2 file of the set; it is
                                    looked up relative to the current working
                                    directory.

                   This will automatically perform all of the parsing and finding
                   values for the entire class. This makes this class slow, so try
                   not to re-construct it too many times."""

                self.__par2_filename = par2_filename
                self.__good_par2_files = []
                self.__corrupt_par2_files = []
                self.__protected_files = []
                self.__possible_files = []
                self.__set_basename = ''
                self.__escaped_basename = ''

                self.__main_logic ()

        def get_par2_filename (self):
                """Get the main Par2 filename.

                   This is the name given to the constructor, unless at least
                   one good par2 file was found: then it is the first good
                   par2 file in sorted order."""
                return self.__par2_filename

        def get_good_par2_files (self):
                """Get a list of good Par2 files in this set."""
                return self.__good_par2_files

        def get_corrupt_par2_files (self):
                """Get a list of corrupt Par2 files in this set."""
                return self.__corrupt_par2_files

        def get_protected_files (self):
                """Get a list of filenames protected by the Par2 set.

                   The list is empty when no good par2 file was found."""
                return self.__protected_files

        def get_possible_files (self):
                """Get a list of files that have a name similar to the main
                   Par2 filename.

                   These are the entries of the current working directory
                   whose names start with the set's base filename."""
                return self.__possible_files

        def get_set_basename (self):
                """Get the base filename of the Par2 set.

                   The base filename is the Par2 filename given, minus the
                   first filename extension (and minus the ".volN+M" part
                   for volume files).

                   Example: For "test.par2", the base filename would be "test"."""
                return self.__set_basename

        def get_escaped_basename (self):
                """Get the escaped base filename of the Par2 set.

                   This name is suitable for use in regular expressions where
                   you just want to match using the literal base name."""
                return self.__escaped_basename

        def __main_logic (self):
                """Fills in all of the private variables in the class.

                   This should be called before you use any of the getter
                   methods in the class."""

                ####################################################
                # 1. Find the base filename; "vol" files such as
                #    name.vol0+1.par2 carry two extensions to strip
                ####################################################
                vol_regex = re.compile (r'\.vol\d+\+\d+\.par2$', re.IGNORECASE)
                basename = os.path.splitext (self.__par2_filename)[0]
                if vol_regex.search (self.__par2_filename):
                        basename = os.path.splitext (basename)[0]
                self.__set_basename = basename

                ####################################################
                # 2. re.escape the filename
                ####################################################
                self.__escaped_basename = re.escape (basename)

                ####################################################
                # 3. use the escaped filename to find all other files in the current set
                #    a. should be good for .000, .001, .r00, .rar
                #    b. should also find all par2 files
                ####################################################
                prefix_regex = re.compile ('^%s' % (self.__escaped_basename, ))
                self.__possible_files = [f for f in os.listdir (os.getcwd ())
                                         if prefix_regex.match (f)]

                ####################################################
                # 4. Parse all par2 files
                #    a. add to the good_par2_files list if it is good
                #    b. add to the corrupt_par2_files list if it is corrupt
                ####################################################
                par2_regex = re.compile (r'\.par2$', re.IGNORECASE)
                parsed = {}     # good par2 filename -> protected filenames

                for f in self.__possible_files:
                        if not par2_regex.search (f):
                                continue

                        # Any failure while parsing means "corrupt".  Catching
                        # Exception (rather than everything) lets
                        # KeyboardInterrupt and SystemExit through.
                        try:
                                parsed[f] = self.__parse_par2_file (f)
                        except Exception:
                                self.__corrupt_par2_files.append (f)
                        else:
                                self.__good_par2_files.append (f)

                ####################################################
                # 5. Use good_par2_files[0], if it exists
                #    a. if it doesn't exist, we can't really parse any of them
                #       so return what we've got
                ####################################################
                if self.__good_par2_files:
                        self.__good_par2_files.sort ()
                        first_good = self.__good_par2_files[0]
                        self.__par2_filename = first_good
                        # Already parsed in step 4, no need to read it again
                        self.__protected_files = parsed[first_good]

        def __chompnulls (self, line):
                """Returns the line (a byte string) up to the first null character"""
                return line.partition (b'\0')[0]

        def __native_str (self, raw):
                """Returns raw as the interpreter's native str type.

                   On Python 2 a byte string already is a str and is returned
                   untouched.  On Python 3 the bytes are decoded as UTF-8,
                   with undecodable bytes replaced."""
                if isinstance (raw, str):
                        return raw
                return raw.decode ('utf-8', 'replace')

        def __parse_par2_file (self, filename):
                """Get all of the filenames that are protected by the par2
                file given as the filename.

                Every packet's MD5 hash is verified.  The names are returned
                in the order their file description packets first appear.

                If the file cannot be opened, a message is printed and an
                empty list is returned.

                Raises EnvironmentError (errno.EINVAL) if a packet is
                truncated, has an impossible length or a bad hash, if there is
                no main packet, or if a file listed in the main packet has no
                file description packet.  struct.error may be raised if a
                packet body is too short to unpack."""

                # hashlib replaces the md5 module, which Python 3 no longer has
                import hashlib

                try:
                        par2_file = open (filename, 'rb')
                except EnvironmentError:
                        # NOTE(review): the caller treats this empty result as a
                        # successful parse, so an unreadable file is counted as
                        # "good" -- kept as-is, confirm whether that is intended.
                        print ('Could not open %s' % (filename, ))
                        return []

                pkt_header_fmt = '< 8s Q 16s 16s 16s'
                pkt_header_size = struct.calcsize (pkt_header_fmt)
                file_pkt_fmt = '< 16s 16s 16s Q'
                file_pkt_size = struct.calcsize (file_pkt_fmt)
                main_pkt_fmt = '< Q I'
                main_pkt_size = struct.calcsize (main_pkt_fmt)

                seen_file_ids = set ()
                expected_file_ids = None
                filenames = []

                # "with" guarantees the file is closed, even when a corrupt
                # packet makes us bail out with an exception.
                with par2_file:
                        total_size = os.fstat (par2_file.fileno ()).st_size

                        while True:
                                header = par2_file.read (pkt_header_size)
                                if not header:
                                        break
                                if len (header) < pkt_header_size:
                                        raise EnvironmentError (errno.EINVAL,
                                                "corrupt par2 file - truncated packet header")

                                _magic, pkt_len, pkt_md5, _set_id, pkt_type = \
                                        struct.unpack (pkt_header_fmt, header)

                                # pkt_len counts the header too.  Reject lengths that
                                # cannot be right before handing them to read(): a
                                # negative count would read the whole file and a huge
                                # one would try to allocate that much memory.
                                body_len = pkt_len - pkt_header_size
                                if body_len < 0 or body_len > total_size - par2_file.tell ():
                                        raise EnvironmentError (errno.EINVAL,
                                                "corrupt par2 file - bad packet length")

                                body = par2_file.read (body_len)

                                # The stored hash covers everything after the hash
                                # field itself: the rest of the header (offset 0x20
                                # onward) followed by the packet body.
                                control_md5 = hashlib.md5 (header[0x20:])
                                control_md5.update (body)
                                if control_md5.digest () != pkt_md5:
                                        raise EnvironmentError (errno.EINVAL,
                                                "corrupt par2 file - bad packet hash")

                                if pkt_type == self._PKT_TYPE_FILE_DESC:
                                        # Only the file id is needed from the fixed part;
                                        # the name follows it, padded with nulls.
                                        file_id = struct.unpack (file_pkt_fmt,
                                                                 body[:file_pkt_size])[0]

                                        if file_id not in seen_file_ids:
                                                seen_file_ids.add (file_id)
                                                name = self.__chompnulls (body[file_pkt_size:])
                                                filenames.append (self.__native_str (name))

                                elif pkt_type == self._PKT_TYPE_MAIN:
                                        # Only the first main packet is used
                                        if expected_file_ids is None:
                                                # Check that the fixed part is complete
                                                struct.unpack (main_pkt_fmt, body[:main_pkt_size])

                                                # The rest of the body is a run of 16-byte
                                                # file ids; a trailing partial id is ignored.
                                                id_count = (len (body) - main_pkt_size) // 16
                                                id_end = main_pkt_size + id_count * 16
                                                expected_file_ids = [body[i:i + 16] for i in
                                                        range (main_pkt_size, id_end, 16)]

                if expected_file_ids is None:
                        raise EnvironmentError (errno.EINVAL,
                                "corrupt or unsupported par2 file - no main packet found")

                for file_id in expected_file_ids:
                        if file_id not in seen_file_ids:
                                raise EnvironmentError (errno.EINVAL,
                                        "corrupt or unsupported par2 file - "
                                        "expected file description packet not found")

                return filenames

if __name__ == '__main__':
        # Small interactive self-test: ask for a par2 file, parse its set
        # and print everything the parser found.

        # raw_input() only exists on Python 2; on Python 3 the same
        # function is called input().
        try:
                read_line = raw_input
        except NameError:
                read_line = input

        # Keep asking until the answer names an existing file
        while True:
                fname = read_line ("Enter PAR2 Filename to test: ")
                fname = os.path.abspath (os.path.expanduser (fname))
                if os.path.isfile (fname):
                        break
                print ("not a file, try again!")

        # The parser looks for the set in the current working directory,
        # so move there and hand it the bare filename.
        os.chdir (os.path.dirname (fname))
        fname = os.path.basename (fname)

        p = Par2Parser (fname)

        # Single-argument print calls give the same output on Python 2
        # (print statement) and Python 3 (print function).
        print ("par2_filename: %s" % (p.get_par2_filename (), ))
        print ("good_par2_files: %s" % (p.get_good_par2_files (), ))
        print ("corrupt_par2_files: %s" % (p.get_corrupt_par2_files (), ))
        print ("protected_files: %s" % (p.get_protected_files (), ))
        print ("possible_files: %s" % (p.get_possible_files (), ))
        print ("set_basename: %s" % (p.get_set_basename (), ))
        print ("escaped_basename: %s" % (p.get_escaped_basename (), ))