#!/usr/bin/env python

# Copyright (c) 2010 Gregor Maier <gregor@majordomus.org>
# Released under the modified BSD-license and GPL v2.0.
# see COPYING for details
#  
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import sys;
import inspect;

import gen_registered_domain_template
from gen_registered_domain_template import Node;

""" 
This is a script for generating python scripts that can find the
"effective TLD" (or "registered domain", or "public suffix") of a domain
or host name.

We utilize Mozilla's public suffix list: http://publicsuffix.org

Some examples: 
   DNS name       "registered domain"   "effective TLD"
   foo.bar.com         bar                  com
   foo.bar.co.uk       bar                  co.uk

The public suffix list is a text file. Ideally we would want to have a script
(or module) that has the list "embedded" in its logic, so that we don't have
to read and parse the suffix list file every time the script is run. 

This script does exactly that:
It can be used to generate a script/module that has the logic embedded. In
this use-case one feeds an effective_tld_names.dat into this script on stdin
and gets the generated script on stdout.  (the effective_tld_names.dat files
can be found on http://publicsuffix.org) e.g.,  

./gen_registered_domain.py < effective_tld_names.dat > registered_domain.py

"""



def build_tree(fp):
    """Parse public suffix rules from *fp* and build the rule tree.

    *fp* is iterated line by line (any iterable of UTF-8 encoded byte
    strings works, e.g. sys.stdin under Python 2). Each line is stripped
    and then handled as follows:

      * Lines starting with "//" are comments. While the license block
        is still open, they are collected (with the leading "//" replaced
        by "#") into the returned copyright text. The license block is
        closed by the first comment line containing "END LICENSE BLOCK";
        that line itself is still collected.
      * Empty lines are skipped.
      * Anything else is a rule. Only the first whitespace-delimited
        token is used. A leading "!" marks an exclusion rule. The rule
        is IDNA (punycode) encoded, lower-cased, split on "." and
        inserted into the tree starting with the right-most label.
        A "*" label sets ``hasstar`` on the node reached so far and ends
        processing of that rule.

    Returns a tuple ``(root, copyright)``: the root Node of the rule tree
    and the collected comment header as a single string.
    """
    root = Node()
    # NOTE(review): the root is flagged as having a wildcard; presumably so
    # that unknown top-level labels are treated as suffixes by the lookup
    # code in the template -- confirm there.
    root.hasstar = True
    root.children = dict()
    copyright_lines = []
    inlicenseblock = True

    for line in fp:
        line = line.strip()
        # ignore comment lines and empty lines
        if line.startswith("//"):
            if inlicenseblock:
                copyright_lines.append(line.replace("//", "#", 1) + "\n")
            if "END LICENSE BLOCK" in line:
                inlicenseblock = False
            continue
        if not line:
            continue
        # spec says: everything after first whitespace is ignored:
        line = line.split()[0]

        # check if it's an exclusion rule
        exclude = line.startswith('!')
        if exclude:
            line = line[1:]

        # re-encode the utf-8 encoded domain names:
        # encode them as punycode domain names.
        line = unicode(line, 'utf-8').encode('idna')

        line = line.lower()

        # Start at root, traverse tree adding nodes if necessary.
        # Labels are visited right-most (top-level) first.
        curnode = root
        for curpart in reversed(line.split('.')):
            if curpart == "*":
                # It's a wildcard. There should not be any further parts.
                # But just in case there are, we break the loop
                curnode.hasstar = True
                break
            if not curnode.children:
                curnode.children = dict()
            if curpart not in curnode.children:
                curnode.children[curpart] = Node()
            curnode = curnode.children[curpart]

        # We are done with this rule. Record the exclusion on the node the
        # rule ended at. The flag is sticky: a later non-exclusion rule that
        # ends on the same node (e.g. a wildcard below an excluded label)
        # must not silently clear an exclusion set by an earlier rule, as
        # that would make the result depend on the order of the rules.
        # getattr() is used because Node's defaults are not visible here.
        curnode.exclude = bool(exclude or getattr(curnode, "exclude", False))

    return (root, "".join(copyright_lines))

if __name__ == "__main__":
    # Read the rules file from stdin and build the
    # rule tree.
    (root, copyright) = build_tree(sys.stdin);
    # get a eval()'able string representation of the tree
    tree_as_string = root.format()

    print "#!/usr/bin/env python"

    print gen_registered_domain_template.copyright;
    print copyright;

    print inspect.getsource(gen_registered_domain_template);

    # add the actual tree to the end of the generated file
    print "publicSuffixRuleTree = ",
    print tree_as_string

    print """
if __name__ == "__main__":
    main();
"""


