# -*- coding: utf-8
# $Id$
# Copyright 2001, 2002 by IVA Team and contributors-2002
#
# This file is part of IVA.
#
# IVA is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# IVA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with IVA; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Validators and sanitisers for user-supplied strings.

The ``is_valid_*`` functions return 1 (valid) or 0 (invalid).  The
``strip_*`` functions and ``render`` return a cleaned-up copy of their
input.
"""

__version__ = "$Revision$"[11:-2]

import string
import re
from types import UnicodeType

# Scheme (lower case letters), a colon, then at least one letter or slash.
_URL_RE = re.compile(r'^[a-z]+:[a-z/]+')

# Characters that may never appear in a title: angle brackets (markup)
# and every white space character except the plain space.
_FORBIDDEN_TITLE_CHARS = '<>\t\n\r\x0b\x0c'

# Characters an id may consist of.
_ID_CHARS = string.ascii_letters + string.digits + '_-.'

# A tag name only counts as matched when it is followed by white space,
# a slash or the closing bracket; otherwise 'i' would also match '<img'.
_TAG_NAME_END = r'(?=[\s/>])'

# NOTE(review): the replacement literals below are reproduced exactly as
# they appear in the file (a plain space / a newline).  With these values
# the white space conversion in render() changes almost nothing; the
# comments and the ('p', 'br') default of ignore_whitespace_magic suggest
# they originally held HTML markup (non-breaking space entities,
# paragraph and line break tags) that was lost somewhere.  Confirm
# against the canonical copy of this file before relying on them.
_INDENT_REPLACEMENTS = {' ': ' ', '\t': ' '}
_PARAGRAPH_BREAK = "\n"
_LINE_BREAK = "\n"
_BLOCK_OPEN = "\n"
_BLOCK_CLOSE = "\n"


def is_valid_url(url):
    """Check if given URL is valid.

    The URL has to pass is_valid_title() and start with a lower case
    scheme, a colon and at least one lower case letter or slash.

    Returns 1 if the URL is acceptable, otherwise 0.
    """
    # FIXME: Add some more strict checks here...
    if is_valid_title(url) and _URL_RE.search(url):
        return 1
    return 0


# This should accept most reasonable one line human
# readable string. Use something more strict for ids.
def is_valid_title(name):
    """Check if given name is valid.

    A valid name is non-empty, neither starts nor ends with white space
    and contains no angle brackets, tabs, line breaks, vertical tabs or
    form feeds.

    Returns 1 if the name is acceptable, otherwise 0.
    """
    # Truthiness (rather than len()) also rejects None without raising.
    if not name:
        return 0
    if name[0] in string.whitespace or name[-1] in string.whitespace:
        return 0
    for c in name:
        if c in _FORBIDDEN_TITLE_CHARS:
            return 0
    return 1


def is_valid_id(name):
    """Check if given name is usable as an id.

    On top of the is_valid_title() rules, every character has to be an
    ASCII letter, a digit, or one of '_', '-' and '.'.

    Returns 1 if the name is acceptable, otherwise 0.
    """
    if not is_valid_title(name):
        return 0
    for c in name:
        if c not in _ID_CHARS:
            return 0
    return 1


def is_valid_date(dd, mm, yyyy):
    """Check if given date is valid.

    FIXME: not implemented -- every date is currently accepted.  The
    arguments are ignored and 1 is always returned.
    """
    return 1


# legal_tags should be a list of element names without angle brackets,
# e.g. ('p','b','br')
def strip_tags(s, legal_tags=()):
    """Return a string with almost all tags removed.

    s          -- the text to clean.
    legal_tags -- element names (without angle brackets) whose opening
                  and closing tags are kept.  Names are matched exactly:
                  allowing 'i' does not allow '<img>'.

    Upper case spellings of the legal tags are converted to lower case.
    Every other '<...>' sequence is deleted.  Attributes of legal tags
    are not inspected.
    """
    names = list(legal_tags)
    # Append closing tags for all elements.
    tags = names + ['/' + x for x in names]

    if not tags:
        # Nothing is allowed, so every tag goes.
        pattern = "<.*?>"
    else:
        # Convert legal upper case tags to lower case. (We don't
        # care about illegal tags as they are thrown away anyway...)
        for tag in tags:
            s = re.sub(
                '<' + re.escape(tag.upper()) + _TAG_NAME_END,
                # A function is used so that the tag name is never
                # interpreted as a replacement template.
                lambda match, lowered='<' + tag: lowered,
                s)
        # Remove all tags that do not contain an opening or closing
        # version of the allowed tags.
        alternatives = '|'.join([re.escape(tag) for tag in tags])
        pattern = "<(?!(?:" + alternatives + ")" + _TAG_NAME_END + ").*?>"

    # The flag must go to compile(): the fourth positional argument of
    # re.sub() is the replacement *count*, so passing re.M there limited
    # the clean-up to the first 8 tags.
    # NOTE(review): '.' does not match a newline here, so a tag that is
    # split over several lines is left alone.  Consider re.S if such
    # tags must be removed as well.
    return re.compile(pattern, re.M).sub("", s)


def _is_xml_char(codepoint):
    """Tell whether the code point is in the XML 1.0 Char production."""
    return (codepoint in (0x9, 0xA, 0xD)
            or 0x20 <= codepoint <= 0xD7FF
            or 0xE000 <= codepoint <= 0xFFFD
            or 0x10000 <= codepoint <= 0x10FFFF)


def strip_non_xml(s, encoding='utf-8'):
    """Remove all characters not allowed in XML (mainly control chars).

    s        -- unicode string, or a byte string in the given encoding.
    encoding -- used to decode (and re-encode) byte string input.

    Returns the cleaned text in the same form as the input: unicode in,
    unicode out; byte string in, byte string (same encoding) out.
    """
    is_unicode = isinstance(s, UnicodeType)
    if is_unicode:
        us = s
    else:
        us = unicode(s, encoding)

    # Compare code points instead of characters: u'\u10000' is the two
    # character string U+1000 + '0', not the code point U+10000.
    # On interpreters that store characters above U+FFFF as surrogate
    # pairs each half is seen separately here and is still dropped.
    clean_s = u"".join([c for c in us if _is_xml_char(ord(c))])

    if is_unicode:
        return clean_s
    return clean_s.encode(encoding)


def strip_all(s):
    """Strip leading and trailing white space + all HTML tags
    (i.e. all stuff inside < and > characters (including those
    chars.))"""
    return strip_tags(s.strip())


from config import normal_entry_tags  #= ['p','br','i','b','ul','ol','li']
from config import normal_entry_tags_and_link  #= normal_entry_tags + ['a']


def render(
    text,
    do_strip=1,
    legal_tags=(),
    do_vertical_space=1,
    do_horizontal_space=0,
    ignore_whitespace_magic=('p', 'br')
    ):
    """Render given text with whitespace respect on, optionally
    removing all or some tags.

    text                    -- the text to render.
    do_strip                -- if true, run strip_tags(text, legal_tags)
                               first.
    legal_tags              -- tags that survive the stripping.
    do_vertical_space       -- if true, convert line breaks.
    do_horizontal_space     -- if true, convert the spaces and tabs at
                               the beginning of every line.
    ignore_whitespace_magic -- element names; if the text contains the
                               start of such a tag (all lower case or
                               all upper case) no white space conversion
                               is done at all.

    Returns the rendered text.
    """
    if do_strip:
        text = strip_tags(text, legal_tags)

    # Text that already carries its own layout markup is returned as is.
    openers = ['<' + x for x in ignore_whitespace_magic]
    for magic in [x.upper() for x in openers] + openers:
        if magic in text:
            return text

    # According to Jukka Korpela
    # (http://www.cs.tut.fi/~jkorpela/HTML3.2/5.56.html):
    # 'It is recommended in the specifications that browsers
    # canonicalize line endings to CR, LF (ASCII decimal 13, 10)
    # when submitting the contents of the field. However, authors
    # should not rely on this, since not all browsers behave so.'
    #
    # So, let's be paranoid...
    # ... and normalize all line feeds to \n
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # Convert spaces and tabs at the beginning of the paragraphs.
    # (Makes some kind of indentation possible.)
    if do_horizontal_space:
        converted = []
        for line in text.split('\n'):
            rest = line.lstrip(' \t')
            indent = line[:len(line) - len(rest)]
            converted.append(
                ''.join([_INDENT_REPLACEMENTS[c] for c in indent]) + rest)
        text = '\n'.join(converted)

    if do_vertical_space:
        # Empty lines separate paragraphs, single line feeds are line
        # breaks.  The order matters: pairs have to be handled first.
        text = text.replace('\n\n', _PARAGRAPH_BREAK)
        text = text.replace('\n', _LINE_BREAK)
        # NOTE(review): the original indentation of this statement is
        # not recoverable from the file; it is assumed to belong to the
        # do_vertical_space branch -- confirm.
        text = _BLOCK_OPEN + text + _BLOCK_CLOSE

    return text