#! /usr/bin/python2

# Copyright (c) 2012 Olivier Esser ( Firstname.Lastname@gmail.com )

# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


import sys,os,re,urllib,argparse,sets

# Name of the file holding the list of top-level domains (one per line).
file_domain_names="tlds-alpha-by-domain.txt"

# Read the raw lines of the TLD list. The file is looked up relative to the
# current working directory first and, failing that, in the directory that
# contains this script. The "with" blocks make sure the handle is closed
# as soon as the lines have been read.
try:
    with open(file_domain_names) as tld_file:
        domain_names=tld_file.readlines()
except (IOError,OSError):
    try:
        script_dir=os.path.dirname(os.path.realpath(sys.argv[0]))
        with open(os.path.join(script_dir,file_domain_names)) as tld_file:
            domain_names=tld_file.readlines()
    except (OSError,IOError):
        # Neither location worked: tell the user where to get the file and abort.
        print>>sys.stderr, "Cannot open %s" % file_domain_names
        print>>sys.stderr, "You can download it from: http://www.icann.org/en/resources/registries/tlds"
        sys.exit(1)

# Command-line interface: one positional input (file or URL) plus optional
# output file, section name and extra Privoxy actions.
parser=argparse.ArgumentParser(description="""
Convert an urlfilter.ini adblock list destinated to Opera into a Privoxy list.
You have to save the output in /etc/privoxy/fanboy.action (for example) and enable it in /etc/privoxy/config
(after the default.action but before the user.action, look at the actionsfile option).
The paths mentioned may vary according to your installation.
This script has only been tested with the Fanboy Adblock list for Opera
(available at: http://www.fanboy.co.nz/adblock/opera/). It wont't work as is with lists destinated to Adblock+.
""")

# Source list to convert.
parser.add_argument(
    'infile',
    help='The file to convert (can be an URL)',
)

# Destination; argparse opens the named file for writing, stdout otherwise.
parser.add_argument(
    '-o', '--outfile',
    default=sys.stdout,
    type=argparse.FileType('w'),
    help="Output file (default standard output)",
)

# Label placed inside the +block{...} action of the generated section header.
parser.add_argument(
    '-s', '--sectionname',
    default="opera-export",
    help="Name of the section to be used in the privoxy file (optional)",
)

# Each -a occurrence appends one more action to the section header.
parser.add_argument(
    '-a', '--action',
    action="append",
    default=[],
    help="Additional action to be taken on the blocked patterns (can be specified multiple times). Typical use is -a +handle-as-image or -a +handle-as-empty-document",
)

args=parser.parse_args()

# Turn the raw TLD lines into a set of lower-case suffixes with a leading dot
# (a line "COM" becomes ".com"), skipping lines that start with "#" and lines
# that are blank. The builtin set type is used instead of the deprecated
# sets.Set class; membership tests behave the same.
domain_names=set("."+s.strip().lower() for s in domain_names if (not s.startswith("#")) and s.strip())

def split_domain_path(pattern):
    """Heuristically cut an urlfilter pattern into a (domain, path) pair.

    A leading "scheme://" prefix (letters or "*") is dropped first. The
    remainder is then cut at the first "/" and that cut is kept when the
    host part looks like a numeric address or ends with a suffix listed in
    the module-level domain_names set. Otherwise:

    * a pattern starting with "*" is returned whole as the path, with an
      empty domain;
    * a host containing "*" is cut at the first "*" of the pattern instead,
      and the "*" is kept on both sides of the cut;
    * anything else keeps the cut at the first "/".

    This is a heuristic and cannot be guaranteed to be 100% accurate.
    """
    scheme=re.match(r"^([a-zA-Z*]+://)",pattern)
    if scheme:
        pattern=pattern[len(scheme.group(1)):]

    # Cut at the first "/"; the path is empty when there is no "/".
    host,_,rest=pattern.partition("/")
    slash_cut=(host,rest)

    # NOTE(review): this regex needs a dot *before* the first number, so it
    # matches "x.1.2.3.4" but not a bare "1.2.3.4" (which falls through to
    # the checks below) -- confirm whether that is intended.
    if re.match(r".*\.\d+\.\d+\.\d+\.\d+$",host):
        return slash_cut

    # Host ending with a known top-level domain.
    suffix=re.match(r".*(\.[^.]+)$",host)
    if suffix and suffix.group(1).lower() in domain_names:
        return slash_cut

    # Leading wildcard and no recognisable host: everything is path.
    if pattern.startswith("*"):
        return ("",pattern)

    if "*" not in host:
        return slash_cut

    # Wildcard inside the host: cut there and keep a "*" on each side.
    before,after=pattern.split("*",1)
    return (before+'*','*'+after)

def translate_pattern_domain_path(domain,path):
    """Build a Privoxy pattern from an Opera urlfilter domain and path.

    The domain is copied through unchanged. In the path every "*" becomes
    ".*" and every other character is passed through re.escape. A path that
    is empty or ends with "*" loses its last character and is left
    unanchored; any other path gets a trailing "$".

    Returns the string domain + "/" + translated path.
    """
    if path=="" or path.endswith("*"):
        # Drop the trailing wildcard (a no-op on the empty path) and do not
        # anchor the end of the expression.
        body=path[:-1]
        anchor=""
    else:
        body=path
        anchor=r"$"
    pieces=[r".*" if c==r"*" else re.escape(c) for c in body]
    return domain+"/"+"".join(pieces)+anchor

def translate_pattern(pattern):
    """Translate one Opera urlfilter pattern into a Privoxy pattern.

    The pattern is first cut into a domain and a path by split_domain_path,
    and the two parts are then converted by translate_pattern_domain_path.
    """
    domain,path=split_domain_path(pattern)
    return translate_pattern_domain_path(domain,path)

# Read every line of the input list. urllib.urlopen is used so that infile
# may be given as a URL; the handle is closed explicitly once the lines have
# been read so the underlying file or connection is not left open.
try:
    source=urllib.urlopen(args.infile)
    try:
        opera_list=source.readlines()
    finally:
        source.close()
except (IOError,OSError):
    print>>sys.stderr, "Cannot open %s " % args.infile
    sys.exit(1)

# A single iterator is shared by the two loops below, so the second loop
# resumes right after the line on which the first one stopped.
iter_pattern=iter(opera_list)

# Skip everything up to and including the first line starting with "[exclude]".
for pattern in iter_pattern:
    if pattern.strip().startswith("[exclude]"):
        break

# Section header: {+block{<sectionname>} <action> <action> ...}
head=r"{+block{%s}" % args.sectionname
head+="".join(" "+extra for extra in args.action)
head+="}"
print>>args.outfile, head

# Translate each remaining line, ignoring blank lines and lines starting
# with ";" (presumably comments in the urlfilter format).
for pattern in iter_pattern:
    pattern=pattern.strip()
    if pattern=="" or pattern.startswith(";"):
        continue
    print>>args.outfile, translate_pattern(pattern)