#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Matteo Castellini <self {at} mcastellini [dot] net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
slurparchive: batch convert from Mailman archives to mbox files.

usage: slurparchive url
"""

import sys
import os
import re
import urllib
import gzip


# Name of a monthly archive as linked from the index page,
# e.g. "2010-January.txt.gz".
_ARCHIVE_NAME_RE = re.compile(r'\d{4,4}-[A-Z][a-z]*\.txt\.gz')

# Obfuscated sender address at the start of a "From " / "From: " line.
# The match is non-greedy so that the FIRST " at " (the one inside the
# address) is rewritten, not a later one appearing in the sender's name.
# "at?" also accepts " a ", presumably for localized archives.
_FROM_AT_RE = re.compile(r'^(From:? .*?) at? ')


def list_archives(index):
    """Return the archive file names found in the index page text.

    Names are returned in order of first appearance; repeated links to
    the same archive are reported only once so each file is downloaded
    a single time.
    """
    names = []
    for name in _ARCHIVE_NAME_RE.findall(index):
        if name not in names:
            names.append(name)
    return names


def restore_from_address(line):
    """Return line with the obfuscated From address turned back into user@host.

    Only lines starting with "From " or "From: " are affected; every
    other line is returned unchanged.
    """
    return _FROM_AT_RE.sub(r'\1@', line)


def convert_archive(url, gz_name):
    """Download url + gz_name and write it as an mbox file in the cwd.

    The output file is named after the part of gz_name before the first
    dot, with a ".mbox" extension.  The downloaded temporary file is
    removed even if the conversion fails.
    """
    tmp = urllib.urlretrieve(url + gz_name)[0]
    try:
        gz_stream = gzip.open(tmp)
        try:
            mbox = open('%s.mbox' % gz_name.split('.')[0], 'w')
            try:
                for line in gz_stream:
                    mbox.write(restore_from_address(line))
            finally:
                mbox.close()
        finally:
            gz_stream.close()
    finally:
        os.remove(tmp)


def main(argv):
    """Convert every archive linked from the index url given in argv[1].

    Returns the process exit status: 1 if no (or an empty) url was
    supplied, 0 once all archives have been converted.
    """
    # Exit if no url is supplied
    if len(argv) < 2 or not argv[1]:
        # Parenthesized single argument prints the same text on Python 2
        print(__doc__)
        return 1

    # Make sure the index url has a trailing slash
    url = argv[1] if argv[1].endswith('/') else argv[1] + '/'

    # Get the index.html file
    stream = urllib.urlopen(url)
    try:
        index = stream.read()
    finally:
        stream.close()

    for gz_name in list_archives(index):
        convert_archive(url, gz_name)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

