# -*- coding: utf-8
# $Id$

# Copyright 2001, 2002 by IVA Team and contributors-2002
#
# This file is part of IVA.
#
# IVA is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# IVA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with IVA; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

__version__ = "$Revision$"[11:-2]

import string
import re
from types import UnicodeType

def is_valid_url(url):
    """Check if given URL is valid.

    The URL must pass is_valid_title() and must start with one or
    more lower case letters, a colon, and at least one lower case
    letter or slash (e.g. 'http://...' or 'mailto:someone').

    Returns 1 if the URL is accepted, 0 otherwise.
    """
    # FIXME: Add some more strict checks here...
    if not is_valid_title(url):
        return 0
    # Raw string, so the pattern reaches the regexp engine untouched.
    if re.search(r'^[a-z]+:[a-z/]+', url):
        return 1
    return 0

# This should accept most reasonable one-line human-readable
# strings. Use something more strict for ids.
def is_valid_title(name):
    """Tell whether name can be used as a one-line title.

    Returns 0 when name is empty, when its first or last character
    is whitespace, or when it contains '<', '>', a tab, a line feed,
    a carriage return, a vertical tab or a form feed anywhere.
    Returns 1 otherwise.
    """
    if len(name) == 0:
        return 0

    # Neither end of the title may be a whitespace character.
    for edge in (name[0], name[-1]):
        if edge in string.whitespace:
            return 0

    # Angle brackets and line/page control characters are banned
    # everywhere in the title.
    forbidden = '<>\t\n\r\x0b\x0c'
    offending = [ch for ch in name if ch in forbidden]
    if offending:
        return 0
    return 1

def is_valid_id(name):
    """Tell whether name can be used as an id.

    An id must pass is_valid_title() and may consist only of the
    ASCII letters a-z and A-Z, the digits 0-9, and the characters
    '_', '-' and '.'. Returns 1 for a valid id, 0 otherwise.
    """
    if not is_valid_title(name):
        return 0

    permitted = string.ascii_letters + string.digits + '_-.'
    rejected = [ch for ch in name if ch not in permitted]
    if rejected:
        return 0
    return 1

def is_valid_date(dd, mm, yyyy):
    """Check if given date is valid.

    dd, mm and yyyy are the day, the month and the year. Each may be
    given as an integer or as anything int() can convert, such as a
    string of digits.

    Returns 1 if the three values name a date that datetime.date
    accepts (month lengths and leap years are taken into account),
    0 otherwise, including when a value cannot be converted.
    """
    # Imported here so the function does not rely on the import list
    # at the top of the module.
    import datetime

    try:
        datetime.date(int(yyyy), int(mm), int(dd))
    except (TypeError, ValueError, OverflowError):
        # Unconvertible value, or day/month/year out of range.
        return 0
    return 1

# legal_tags should be a list of element names without angle brackets,
# e.g. ('p','b','br')
def strip_tags(s, legal_tags=[]):
    """Return a string with almost all tags removed.

    s is the text to clean. legal_tags is a sequence of element names
    without angle brackets, e.g. ('p', 'b', 'br'). The opening and
    the closing form of every listed element is kept; every other
    '<...>' run is deleted. The default list is only read, never
    modified.

    A tag is treated as legal only when the element name is directly
    followed by whitespace, '/' or '>', so allowing 'i' does not let
    '<img>' or '<iframe>' through. Legal tags written entirely in
    upper case are rewritten using the name as given in legal_tags.
    Tags that span several lines are removed as well.

    NOTE: attributes inside legal tags are left untouched, so this
    function alone is not a complete HTML sanitizer.
    """
    names = list(legal_tags or ())

    if not names:
        # No element is allowed, so every tag goes.
        return re.compile('<[^>]*>').sub('', s)

    # Convert legal upper case tags to lower case. (We don't care
    # about illegal tags as they are thrown away anyway...)
    for name in names:
        shouted = re.compile(
            '<(/?)' + re.escape(name.upper()) + r'(?=[\s/>])')
        # A function is used as replacement so that the name is
        # inserted literally, whatever characters it contains.
        s = shouted.sub(lambda m, name=name: '<' + m.group(1) + name, s)

    # Remove all tags that are not an opening or closing version of
    # one of the allowed elements. The pattern is compiled and its
    # own sub() is used, so all matches are replaced: the fourth
    # positional argument of re.sub() is a replacement count, not a
    # flag set.
    allowed = '|'.join([re.escape(name) for name in names])
    illegal = re.compile('<(?!/?(?:' + allowed + r')[\s/>])[^>]*>')
    return illegal.sub('', s)

def strip_non_xml(s, encoding='utf-8'):
    """Remove all characters not allowed in XML (mainly control chars).

    s is either a unicode string or a byte string. A byte string is
    decoded with the given encoding, cleaned, and encoded back with
    the same encoding; a unicode string is returned as unicode.

    The characters kept are tab, line feed, carriage return and the
    ranges U+0020-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF.
    Everything else is dropped.
    """
    # Imported here so the function does not rely on the import list
    # at the top of the module.
    import sys

    # type(u'') is the interpreter's unicode string type.
    is_text = isinstance(s, type(u''))
    if is_text:
        us = s
    else:
        us = s.decode(encoding)

    # When the interpreter stores characters above U+FFFF as two
    # surrogate code units, a well-formed pair has to be kept as a
    # unit; on their own, surrogates are not legal XML characters.
    pairs_possible = sys.maxunicode < 0x10000

    kept = []
    length = len(us)
    pos = 0
    while pos < length:
        c = us[pos]
        code = ord(c)
        if (code in (0x09, 0x0A, 0x0D)
                or 0x20 <= code <= 0xD7FF
                or 0xE000 <= code <= 0xFFFD
                or 0x10000 <= code <= 0x10FFFF):
            kept.append(c)
        elif (pairs_possible
                and 0xD800 <= code <= 0xDBFF
                and pos + 1 < length
                and 0xDC00 <= ord(us[pos + 1]) <= 0xDFFF):
            kept.append(us[pos:pos + 2])
            pos += 1
        pos += 1

    clean_s = u''.join(kept)
    if is_text:
        return clean_s
    return clean_s.encode(encoding)

def strip_all(s):
    """Trim s and remove every HTML tag from it.

    Leading and trailing white space is stripped from s first; the
    result is then passed to strip_tags() with no legal tags, which
    removes everything between '<' and '>' including those
    characters."""
    trimmed = s.strip()
    return strip_tags(trimmed)

from config import normal_entry_tags #= ['p','br','i','b','ul','ol','li']
from config import normal_entry_tags_and_link #= normal_entry_tags + ['a']

def render(
    text,
    do_strip=1,
    legal_tags=[],
    do_vertical_space=1,
    do_horizontal_space=0,
    ignore_whitespace_magic=('p','br')
    ):
    """Turn plain text into HTML that keeps its whitespace layout.

    text -- the text to render.
    do_strip -- when true, text is first passed through strip_tags()
        together with legal_tags.
    legal_tags -- element names handed to strip_tags().
    do_vertical_space -- when true, an empty line becomes a paragraph
        break, a single line break becomes '<br />', and the result
        is wrapped in '<p>...</p>'.
    do_horizontal_space -- when true, spaces and tabs at the start of
        each line become '&nbsp;' entities (four for a tab).
    ignore_whitespace_magic -- element names; if text contains '<'
        followed by one of them (as given or in upper case), it is
        returned right after the optional stripping and none of the
        whitespace handling is applied.
    """
    if do_strip:
        text = strip_tags(text,legal_tags)

    # Text that already carries its own paragraph or line break
    # markup is left alone.
    openers = ['<' + tag for tag in ignore_whitespace_magic]
    markers = [opener.upper() for opener in openers] + openers
    found = [marker for marker in markers if text.find(marker) >= 0]
    if found:
        return text

    # Browsers are recommended to submit line endings as CR LF, but
    # not all of them do (see Jukka Korpela,
    # http://www.cs.tut.fi/~jkorpela/HTML3.2/5.56.html), so every
    # variant is normalized to a plain \n first.
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # Leading spaces and tabs become &nbsp;s, which makes some kind
    # of indentation possible.
    if do_horizontal_space:
        nbsp = '&nbsp;'
        indented = []
        for line in text.split('\n'):
            cells = list(line)
            for pos, ch in enumerate(cells):
                if ch == ' ':
                    cells[pos] = nbsp
                elif ch == '\t':
                    cells[pos] = '&nbsp;&nbsp;&nbsp;&nbsp;'
                else:
                    # First other character: the indentation is over.
                    break
            indented.append(''.join(cells))
        text = '\n'.join(indented)

    # Double line breaks have to be handled before single ones.
    if do_vertical_space:
        text = text.replace('\n\n', "</p><p>")
        text = text.replace('\n', "<br />")
        text = "<p>" + text + "</p>"

    return text