# -*- coding: utf-8
# Copyright 2004-2006 by Vahur Rebas

import re
import Globals
from Globals import Acquisition, Persistent
from AccessControl import ClassSecurityInfo
from Acquisition import aq_base, aq_inner, aq_parent, Implicit
from OFS.SimpleItem import SimpleItem
from OFS.PropertyManager import PropertyManager
from zope.interface import implements

import xml.dom.minidom
from xml.dom.minidom import Document
from xml.dom import Node
#from xml.dom.ext import PrettyPrint
#from Products.ZCatalog.CatalogAwareness import CatalogAware
from Products.ZCatalog.CatalogPathAwareness import CatalogAware

from textindexng.interfaces import IIndexableContent
from textindexng.interfaces import IConverter, ISplitter
from textindexng.converters import html, sgml, ooffice, pdf
from textindexng.converters.entities import convert_entities
import types

from interfaces import IDocument
from permissions import *
from schemas import get_schema

class Document(Persistent,SimpleItem,CatalogAware, PropertyManager):
    """ One document """
    
    implements(IDocument, IIndexableContent)
    
    meta_type = 'Document'
    security = ClassSecurityInfo()
    #security.declareObjectProtected(perm_view,)
    security.declareObjectPublic()
    
    #XXX: this doesn't work!
    manage_options = PropertyManager.manage_options+SimpleItem.manage_options
    #manage_options=(
    #        {'label':'View',
    #        'action':'document_view'},
    #        )+SimpleItem.manage_options

    def __init__(self, _id, _doc, _schema):
        """ Create a document.

        _id     -- stored as the object's id
        _doc    -- stored as fulldoc (the complete document source)
        _schema -- stored as _document_schema; getMetaSchema() returns it

        NOTE: keep the statement order.  __getattr__ reads _document_schema
        for every attribute that is not found the normal way, and that
        attribute is only assigned near the end of this method.
        """
        self.id = _id
        self.fulldoc = _doc
        self.default_catalog = 'zcatalog'
        
        # Copies of the source taken by start_modding() before marking begins.
        self.org_fulldoc = ''                   # original full document. without marked text
        self.org_title = ''                     # original title        getOrgTitle
        self.org_description = ''               # original description  getOrgDescription
        self.org_body = ''                      # original body         getOrgBody

        self._setProperty('document_status', 0, 'int')       # getStatus
        self._setProperty('is_deleted', 0, 'boolean')        # isDeleted
        
        # Derived from fulldoc by manage_afterAdd().
        self.title = ''             # getTitle
        self.description = ''       # getDescription
        self.body = ''              # getBody. 
        self._setProperty('textdoc', '', 'text')             # plain text, see toText

        # Filled in by countWords().
        self._setProperty('words', 0, 'int')
        self._setProperty('sentences', 0, 'int')

        # Updated by saveDocument().
        self._setProperty('lastModifier', '', 'string')

        # document schema
        self._document_schema = _schema
        self._corpus = ''                       # getCorpus / setCorpus
        self._dsstore = {}
        self._lemmated = False                  # isLemmated
        self._parsed = False                    # isParsed

        self._analysis = ''                     # getAnalysis

    def __getattr__(self, name):
        if name in get_schema(self.getMetaSchema()).getFieldNames():
            return get_schema(self.getMetaSchema()).getValue(name, self)
        raise AttributeError
        
    security.declarePrivate('manage_afterAdd')    
    def manage_afterAdd(self,obj,container):
        """ Rebuild everything that is derived from fulldoc.

        Sets title, description and body, refreshes the 'textdoc' property
        and the word/sentence counts, indexes the object and updates the
        SQL-side statistics.  saveDocument() calls this again after the
        source has been replaced.
        """
        # The former debug `print` was removed: this hook runs on every
        # add/save and should not write to the server's stdout.
        self.title = self.extractTitle()
        self.description = self.extractDescription()
        self.body = self.extractBody()
        self._updateProperty('textdoc', self.toText())
        #self.extractErrors()
        self.countWords()
        self.index_object()
        self._updateDocStatistics()
        self._storeDocInfos()
        #self.setGlobalUsedTongues(self.getTongue())
        #self.setGlobalUsedNations(self.getNation())
    
    security.declarePrivate('manage_beforeDelete')
    def manage_beforeDelete(self, item, container):
        """ Remove the document from the catalog before it is deleted. """
        unindex = self.unindex_object
        unindex()
        
    security.declareProtected(perm_view_document, 'getId')
    def getId(self):
        """ Return the id of this document. """
        doc_id = self.id
        return doc_id

    security.declareProtected(perm_view_document, 'getTitle')
    def getTitle(self):
        """ Return the stored title of this document. """
        current_title = self.title
        return current_title

    def getMetaSchema(self):
        # The schema value that was handed to the constructor.
        schema = self._document_schema
        return schema

    def getCorpus(self):
        # Corpus id of the document; '' when the attribute is missing.
        try:
            return self._corpus
        except AttributeError:
            return ''

    security.declareProtected(perm_manage, 'setCorpus')
    def setCorpus(self, cid):
        """ Assign the document to the corpus with id `cid`. """
        setattr(self, '_corpus', cid)

    security.declareProtected(perm_view_document, 'getDescription')
    def getDescription(self):
        """ Return the stored description of this document. """
        current_description = self.description
        return current_description
    
    security.declareProtected(perm_view_document, 'transformToView')
    def transformToView(self):
        """ Return the body exactly as getBody() delivers it. """
        return self.getBody()

    def isDeleted(self):
        # Value of the 'is_deleted' property.
        deleted_flag = self.getProperty('is_deleted')
        return deleted_flag
        
    security.declareProtected(perm_view_document, 'getBody')
    def getBody(self):
        """ Return the stored body markup of this document. """
        current_body = self.body
        return current_body
    
    security.declareProtected(perm_view_document, 'indexableContent')
    def indexableContent(self, fields):
        # Build the TextIndexNG content collector for this document: the full
        # source as 'text/html' plus the marked words and the title.
        # `fields` is not used here.
        from textindexng.content import IndexContentCollector as ICC

        def as_unicode(value):
            # Decode UTF-8 byte strings; anything else (notably values that
            # are already unicode) is passed through untouched, because
            # unicode(u'...', 'utf-8') raises TypeError.
            if isinstance(value, str):
                return unicode(value, 'utf-8')
            return value

        icc = ICC()
        icc.addBinary('getDocument', as_unicode(self.fulldoc), 'text/html')
        icc.addContent('getMarkedWords', as_unicode(self.getMarkedWords()))
        icc.addContent('getTitle', as_unicode(self.getTitle()))
        return icc
    
    security.declareProtected(perm_view_document, 'getDocument')
    def getDocument(self):
        """ Return the full document source and set a UTF-8 HTML content type. """
        response = self.REQUEST.RESPONSE
        response.setHeader("Content-type","text/html; charset=UTF-8")
        return self.fulldoc

    security.declareProtected(perm_view_document, 'getStatus')
    def getStatus(self):
        """ Return the 'document_status' property.
            0 - document is in editing mode
            1 - document is in marking mode
        """
        status = self.getProperty('document_status')
        return status

    security.declareProtected(perm_view_document, 'getSubmitter')
    def getSubmitter(self):
        """ Return the user name taken from the document's owner tuple. """
        owner_info = self.getOwnerTuple()
        return owner_info[1]

    security.declareProtected(perm_view_document, 'getLastModder')
    def getLastModder(self):
        """ Return the last modifier of the document, or -1 when none is recorded. """
        return getattr(self, 'lastModifier', -1)

    security.declareProtected(perm_change_status, 'start_modding')
    def start_modding(self, REQUEST):
        """ Switch the document to marking mode and go to the marking page.

        When the status is still 0 the status is set to 1, the current
        fulldoc/title/description/body are copied into the matching org_*
        attributes and the object is reindexed.  The redirect to
        'document_mark' happens in every case.
        """
        if not self.getStatus():
            import copy
            self._updateProperty('document_status', 1)
            for attr in ('fulldoc', 'title', 'description', 'body'):
                snapshot = copy.deepcopy(getattr(self, attr))
                setattr(self, 'org_' + attr, snapshot)
            self.reindex_object()
        response = REQUEST.RESPONSE
        return response.redirect('document_mark')

    def isLemmated(self):
        # The _lemmated flag; False when the attribute is missing.
        try:
            return self._lemmated
        except AttributeError:
            return False

    def isParsed(self):
        # The _parsed flag; False when the attribute is missing.
        try:
            return self._parsed
        except AttributeError:
            return False

    def getAnalysis(self):
        # The stored _analysis value; "" when the attribute is missing.
        try:
            return self._analysis
        except AttributeError:
            return ""
    
    security.declareProtected(perm_edit_document, 'extractTitle')
    def extractTitle(self):
        """ Extract the title from the document source.

        Parses getDocument() and returns the text of the first <title>
        element as produced by get_text().  Returns '' when the document
        has no <title> element, in line with extractDescription().
        """
        doc = xml.dom.minidom.parseString(self.getDocument())
        titles = doc.getElementsByTagName('title')
        if not titles:
            return ''
        return get_text(titles[0])
    
    security.declareProtected(perm_edit_document, 'extractDescription')
    def extractDescription(self):
        """ Return the content of the first <meta name="description"> tag, or "". """
        dom = xml.dom.minidom.parseString(self.getDocument())
        for meta in dom.getElementsByTagName('meta'):
            if meta.getAttribute('name') != 'description':
                continue
            return meta.getAttribute('content')
        return ""
    
    security.declareProtected(perm_edit_document, 'extractBody')
    def extractBody(self):
        """ Extract the inner markup of <body> from the document source.

        Returns the serialised children of the first <body> element as a
        UTF-8 encoded string.  Raises IndexError when there is no <body>.
        """
        doc = xml.dom.minidom.parseString(self.getDocument())
        bodytag = doc.getElementsByTagName('body')[0]
        # Serialise the children instead of stripping the tags with a regex:
        # the regex only matched a bare "<body>", so "<body class=...>" or an
        # empty "<body/>" ended up inside the stored body.
        inner = u''.join([child.toxml() for child in bodytag.childNodes])
        return inner.encode('utf-8')
    
    security.declareProtected(perm_edit_document, 'saveDocument')
    def saveDocument(self,REQUEST):
        """ Save the edited document.

        Stores REQUEST.kupu as the new fulldoc, rebuilds the derived data via
        manage_afterAdd(), records the authenticated user in the
        'lastModifier' property and redirects to the document's URL.
        """
        self.fulldoc = REQUEST.kupu
        self.manage_afterAdd(self,self.aq_parent)
        modifier = str(REQUEST.AUTHENTICATED_USER)
        # Instances without the property get it created; an explicit check
        # replaces the former bare `except:` that hid every other error too.
        if self.hasProperty('lastModifier'):
            self._updateProperty('lastModifier', modifier)
        else:
            self._setProperty('lastModifier', modifier, 'string')
        return REQUEST.RESPONSE.redirect(self.absolute_url())

    security.declareProtected(perm_view_document, 'getMarks')
    def getMarks(self, REQUEST):
        """ Return the marks HTML of the current user for the marking page. """
        doc_id = self.getId()
        user = str(REQUEST.AUTHENTICATED_USER)
        return self.Errors.getDocumentMarksHTML(doc_id, user)

    security.declareProtected('do_not_touch_me', 'saveMarksNG_rescue')
    def saveMarksNG_rescue(self, REQUEST):
        """ Variant of saveMarksNG that takes the ranges as Python literals.

        Reads the request values 'range_1' .. 'range_999'.  Each value is a
        string that is evaluated into a sequence (pointer, code[, existing]);
        the author comes from the request value 'username'.  New marks are
        stored through self.Errors together with their sentence context.
        """
        count = 0
        from xml import xpath
        # Upper bound of the range_<n> request keys that are probed.
        coun = 1000
        for x in range(1,coun):
            pair = REQUEST.get('range_'+str(x), None)
            if not pair:
                continue
            # NOTE(review): eval() on a request value executes arbitrary
            # code.  Only the permission declared for this method stands
            # between a caller and that - confirm it is never granted.
            pair = eval(pair)
            print "-->", x, pair
            r = pair[0]
            pointer = pair[0]
            try:
                code = pair[1]
            except IndexError:
                print "index error 1", pair
                continue
            existing = None
            try:
                existing = pair[2]
            except IndexError:
                pass
            if existing:
                # A third item refers to an already stored mark: it is
                # deleted when 'DELETEME' is also present, never re-added.
                if existing == 'DELETEME':
                    continue
                if 'DELETEME' in pair:
                    self.Errors.deleteError(existing)
                print "existing...", pair
                continue
            # Pointer layout: '<start xpath>#<label>:<offset>;<end xpath>#<label>:<offset>'
            r_st, r_en = r.split(';')
            doc = xml.dom.minidom.parseString(self.getDocument())
            start, st_offset = r_st.split('#')
            st_offset = st_offset.split(':')[1]
            end, en_offset = r_en.split('#')
            en_offset = en_offset.split(':')[1]
            body = doc.getElementsByTagName('body')[0]
            start = start.lower()
            end = end.lower()
            # The paths are evaluated relative to <body>, so drop a leading '/'.
            if start.startswith('/'): start = start[1:]
            if end.startswith('/'): end = end[1:]

            print "....", x, start
            start_node = xpath.Evaluate(start, body)[0] # always take the
            end_node = xpath.Evaluate(end, body)[0]     # first node we get
            if start_node.isSameNode(end_node):
                content = start_node.nodeValue[int(st_offset):int(en_offset)]
            else:
                content = self.extractMarkedContent(start_node, end_node, int(st_offset), int(en_offset))
            author = REQUEST.get('username')
            if type(content) == types.UnicodeType:
                content = content.encode('utf-8')
            err = self.Errors.addNewError(pointer, content, self.getId(), code, author)
            pre, post = self.extractContext(body, start_node, end_node, int(st_offset), int(en_offset), start=1)
            # extractContext collects the preceding pieces walking backwards.
            pre.reverse()
            err.addPreContext(pre)
            err.addPostContext(post)

        self.reindex_object()
        return REQUEST.RESPONSE.redirect(self.absolute_url())

    security.declareProtected(perm_mark_document, 'saveMarksNG')
    def saveMarksNG(self, REQUEST):
        """ Store the marks submitted from the marking page.

        Reads the request values 'range_1' .. 'range_999'.  Each present
        value is indexed as (pointer, code[, existing]):

          pointer  -- '<start xpath>#<label>:<offset>;<end xpath>#<label>:<offset>',
                      with the xpaths relative to <body>
          code     -- the mark code to store
          existing -- set for marks that are already stored; such entries
                      are never added again and are deleted when the value
                      also contains 'DELETEME'

        New marks are stored through self.Errors together with the sentence
        context around them.  Finally the document is reindexed and the
        browser redirected to the document's URL.
        """
        from xml import xpath
        body = None
        for x in range(1, 1000):
            pair = REQUEST.get('range_'+str(x), None)
            if not pair:
                continue
            pointer = pair[0]
            try:
                code = pair[1]
            except IndexError:
                continue
            try:
                existing = pair[2]
            except IndexError:
                existing = None
            if existing:
                if existing != 'DELETEME' and 'DELETEME' in pair:
                    self.Errors.deleteError(existing)
                continue
            r_st, r_en = pointer.split(';')
            if body is None:
                # The parsed document is only read below, so one parse
                # serves all ranges of this request.
                doc = xml.dom.minidom.parseString(self.getDocument())
                body = doc.getElementsByTagName('body')[0]
            start, st_offset = r_st.split('#')
            st_offset = st_offset.split(':')[1]
            end, en_offset = r_en.split('#')
            en_offset = en_offset.split(':')[1]
            start = start.lower()
            end = end.lower()
            # The paths are evaluated relative to <body>, so drop a leading '/'.
            if start.startswith('/'): start = start[1:]
            if end.startswith('/'): end = end[1:]

            # Known failure (IndexError below): with a source such as
            #   <body><span lang="ET" />
            # the client sends span/p[8]/span/text() where
            # p[8]/span/text() would be correct, so nothing matches.
            start_node = xpath.Evaluate(start, body)[0] # always take the
            end_node = xpath.Evaluate(end, body)[0]     # first node we get
            st_offset = int(st_offset)
            en_offset = int(en_offset)
            if start_node.isSameNode(end_node):
                content = start_node.nodeValue[st_offset:en_offset]
            else:
                content = self.extractMarkedContent(start_node, end_node, st_offset, en_offset)
            author = str(REQUEST.AUTHENTICATED_USER)
            if type(content) == types.UnicodeType:
                content = content.encode('utf-8')
            err = self.Errors.addNewError(pointer, content, self.getId(), code, author)
            pre, post = self.extractContext(body, start_node, end_node, st_offset, en_offset, start=1)
            # extractContext collects the preceding pieces walking backwards.
            pre.reverse()
            err.addPreContext(pre)
            err.addPostContext(post)

        self.reindex_object()
        return REQUEST.RESPONSE.redirect(self.absolute_url())

    security.declareProtected(perm_edit_document, 'extractContext')
    def extractContext(self, body, start_node, end_node, start_offset, end_offset, 
            start = 0, res = None, seen_start=0, seen_end=0, pre='', post='', node=None):
        """
        Collect the text before and after a marked range, cut into sentences.

        Call it with start=1; the remaining keyword parameters only carry
        state through the recursive calls (res is never read).

        Top-level call (start true) returns (pre_sent, post_sent):
          pre_sent  -- up to five pieces of the text before the mark, the
                       piece closest to the mark first
          post_sent -- up to five pieces of the text after the mark, in
                       reading order
        Recursive calls return (seen_start, seen_end, pre, post).
        """
        if start:
            res = []
            pre = post = ''
            seen_start = seen_end = 0
            node = body
        # Walk the children in document order, threading the state through.
        for x in node.childNodes:
            done_this = 0
            if x.isSameNode(start_node): seen_start = 1
            if x.isSameNode(end_node): seen_end = 1
            if x.isSameNode(start_node):
                # text of the start node up to the beginning of the mark
                pre += x.nodeValue[:start_offset]
                done_this = 1
            if x.isSameNode(end_node):
                # text of the end node from the end of the mark onwards
                post += x.nodeValue[end_offset:]
                done_this = 1
            if not seen_start and not seen_end and not done_this:
                # node lies before the mark; nodes without a value add a blank
                if x.nodeValue is None: pre += ' '
                else: pre += x.nodeValue
            if not done_this and seen_start and not seen_end:
                # node lies inside the marked range: not part of the context
                pass
            if seen_end and not done_this:
                # node lies after the mark
                if x.nodeValue is None: post += ' '
                else: post += x.nodeValue
            if x.hasChildNodes():
                seen_start, seen_end, pre, post = self.extractContext(body, start_node, end_node, start_offset, end_offset, 0, res, seen_start, seen_end, pre, post, x)
                
        if start:
            # Split `pre` walking backwards.  A piece is closed whenever a
            # '.', '!' or '?' is met and the next piece starts with that
            # character; text before the earliest terminator is dropped.
            i = len(pre)
            buff = ''
            pre_sent = []
            while i>0:
                i += -1
                if pre[i] == '.' or pre[i] == '!' or pre[i] == '?':
                    pre_sent.append(buff)
                    buff = ''
                buff = pre[i] + buff
            pre_sent = pre_sent[:5]
            # Split `post` walking forwards.  Every piece ends with its
            # terminator; trailing text without one is dropped.
            i = 0
            buff = ''
            post_sent = []
            while i<len(post):
                buff += post[i]
                if post[i] == '.' or post[i] == '!' or post[i] == '?':
                    post_sent.append(buff)
                    buff = ''
                i += 1
            post_sent = post_sent[:5]
            return pre_sent, post_sent
        return seen_start, seen_end, pre, post

    security.declareProtected(perm_edit_document, 'extractMarkedContent')
    def extractMarkedContent(self, start_node, end_node, start_offset, end_offset):
        """ get content """
        result = ''
        start_tree = []
        end_tree = []
        runner = start_node
        while runner.parentNode:
            start_tree.append(runner)
            runner = runner.parentNode
        runner = end_node
        while runner.parentNode:
            end_tree.append(runner)
            runner = runner.parentNode
        start_tree.reverse()
        end_tree.reverse()
        length = len(start_tree)
        if len(end_tree)<length:
            length = len(end_tree)
        commonParent = None
        for x in range(length):
            if not start_tree[x].isSameNode(end_tree[x]):
                commonParent = start_tree[x-1]
                break
        result, record = self.walker(commonParent, start_node, end_node, start_offset, end_offset)
        return result

    security.declareProtected(perm_edit_document, 'walker')
    def walker(self, node, start_node, end_node, start_offset, end_offset, record=0, level=0):
        """ get node after node """
        result = ''
        for x in node.childNodes:
            if x.nodeType == 3:
                do_this_round = 1
                if start_node.isSameNode(x):
                    result += x.nodeValue[int(start_offset):]
                    record = 1
                    do_this_round = 0
                elif end_node.isSameNode(x):
                    result += x.nodeValue[:int(end_offset)]
                    record = 0
                    do_this_round = 0
                if record and do_this_round:
                    result += x.nodeValue
            if x.hasChildNodes():
                res = self.walker(x, start_node, end_node, start_offset, end_offset, record, level+1)
                result += res[0]
                record = res[1]
        return [result, record]

    security.declareProtected(perm_mark_document, 'useThis')
    def useThis(self):
        """ Mark the document as used for the user's own study; always returns 1. """
        in_use = 1
        return in_use

    security.declareProtected(perm_edit_document, 'toText')
    def toText(self):
        """ Convert the HTML source to text with the TextIndexNG html converter. """
        converter = html.Converter()
        source = unicode(self.getDocument(),'utf-8')
        converted = converter.convert(source, 'utf-8')
        return converted[0]

    security.declareProtected(perm_view_document, 'get_n_of_errors')
    def get_n_of_errors(self):
        # Number of marks stored for this document.
        return len(self.Errors.getDocumentMarks(self.getId()))

    security.declareProtected(perm_view_document, 'get_n_of_diff_errors')
    def get_n_of_diff_errors(self):
        # Number of distinct 'code' values among the marks of this document.
        distinct = []
        for mark in self.Errors.getDocumentMarks(self.getId()):
            code = mark.getProperty('code')
            if code in distinct:
                continue
            distinct.append(code)
        return len(distinct)

    security.declareProtected(perm_view_document, 'getMarkedWords')
    def getMarkedWords(self):
        # Comma separated 'content' values of all marks of this document.
        marks = self.Errors.getDocumentMarks(self.getId())
        return ','.join([mark.getProperty('content') for mark in marks])

    security.declareProtected(perm_view_document, 'getUsedCodes')
    def getUsedCodes(self):
        """ Return the 'code' of every mark of this document (for indexing). """
        marks = self.Errors.getDocumentMarks(self.getId())
        return [mark.getProperty('code') for mark in marks]

    security.declareProtected(perm_view_document, 'prettyUsedCodes')
    def prettyUsedCodes(self, REQUEST):
        # One [code, pointer, pretty code title] entry per mark the current
        # user has in this document; code and pointer are UTF-8 encoded.
        user = str(REQUEST.AUTHENTICATED_USER)
        rows = []
        for mark in self.Errors.getDocumentMarks(self.getId(), user):
            code = mark.getProperty('code').encode('utf-8')
            pointer = mark.getProperty('pointer').encode('utf-8')
            rows.append([code, pointer, self.Marks.prettyCodeTitle(code)])
        return rows

    security.declareProtected(perm_view_document, 'getInheritedCodes')
    def getInheritedCodes(self, REQUEST):
        """ Return the titles of the parent marks of the user's codes, each once. """
        user = str(REQUEST.AUTHENTICATED_USER)
        titles = []
        visited = []
        for mark in self.Errors.getDocumentMarks(self.getId(), user):
            code = self.Marks.getCode(mark.getProperty('code'))
            try:
                ancestor = code.aq_parent
            except AttributeError:
                # no such code object (or nothing above it): skip this mark
                continue
            # climb while the container is itself a mark
            while ancestor.meta_type == 'mark':
                if ancestor not in visited:
                    visited.append(ancestor)
                    titles.append(ancestor.getTitle())
                ancestor = ancestor.aq_parent
        return titles

    security.declareProtected(perm_view_document, 'get_n_of_words')
    def get_n_of_words(self):
        """ Return the 'words' property (number of words in the document). """
        word_total = self.getProperty('words')
        return word_total

    security.declareProtected(perm_view_document, 'get_n_of_sentences')
    def get_n_of_sentences(self):
        """ Return the 'sentences' property (number of sentences in the document). """
        sentence_total = self.getProperty('sentences')
        return sentence_total

    security.declareProtected(perm_view_document, 'prettyDate')
    def prettyDate(self):
        """ Return the modification time formatted as 'dd-mm-YYYY HH:MM'. """
        modified = self.bobobase_modification_time()
        return modified.strftime('%d-%m-%Y %H:%M')

    security.declareProtected(perm_view_document, 'rawDate')
    def rawDate(self):
        """ Return the modification time in ISO form (for indexing). """
        modified = self.bobobase_modification_time()
        return modified.ISO()

    security.declareProtected(perm_edit_document, 'countWords')
    def countWords(self):
        """ Count words and sentences of the 'textdoc' property.

        Words are the whitespace separated tokens; sentences are counted as
        the number of '.', '!' and '?' characters.  Both totals are written
        to the 'words' and 'sentences' properties.
        """
        text = self.getProperty('textdoc')
        sentence_total = sum([text.count(mark) for mark in ('.', '!', '?')])
        word_total = len(text.split())
        self._updateProperty('words', word_total)
        self._updateProperty('sentences', sentence_total)
        return
        
    security.declareProtected(perm_manage, 'convertErrorsNG')
    def convertErrorsNG(self, start=1, doc=None, node=None, level=0, xnodes=None):
        """ Convert inline <error> marker elements into stored marks.

        Call it without arguments; doc, node, level and xnodes only carry
        state through the recursive calls.

        The walk removes every <error> element from the parsed source and
        remembers, per marker, the xpath of a neighbouring text node, an
        offset into it and the marker's 'class', 'uniq' and 'id' attributes.
        Markers sharing a 'uniq' value that come as a 'start'/'end' pair are
        then stored through self.Errors with their sentence context, and
        fulldoc/body are replaced by the cleaned source.  Returns "ok".
        """
        if start:
            doc = xml.dom.minidom.parseString(self.getDocument())
            # presumably the first child is the doctype and its sibling the
            # root element - TODO confirm against the stored documents
            node = doc.firstChild
            node = node.nextSibling
            xnodes = []
        elif xnodes is None:
            xnodes = []
        childs = node.childNodes
        dont_decr = 0
        c = 0
        while c<len(childs):
            x = childs[c]
            if x.nodeName.lower() == 'error':
                # Look for the text node next to the marker: scan forwards
                # until the marker is found, then backwards (unless the
                # marker is the first child) and, failing that, forwards
                # again from the beginning.
                ts = node.childNodes
                i = 0
                back = 0
                look_node = 0
                while i<len(ts):
                    if look_node and ts[i].nodeType == Node.TEXT_NODE:
                        xpoint = self.mig_walker(ts[i])
                        klass = x.getAttribute('class')
                        uniq = x.getAttribute('uniq')
                        code_id = x.getAttribute('id')
                        if back:
                            # marker follows this text: position is its end
                            xnodes.append([xpoint, len(ts[i].nodeValue), klass, uniq, code_id])
                        else:
                            # marker precedes this text: position is its start
                            xnodes.append([xpoint, 0, klass, uniq, code_id])
                        break
                    if ts[i].isSameNode(x):
                        if ts[i].isSameNode(node.firstChild):
                            pass
                        else:
                            if not look_node:
                                back = 1
                        look_node = 1
                    if back: 
                        i -= 1
                        if i == -1:
                            i = 0
                            back = 0
                    else: 
                        i += 1
                node.removeChild(x)
                doc.normalize()
                # childs shrank: index c now points at the next node
                dont_decr = 1
                continue
            # hasChildNodes is a method; it used to be tested uncalled and
            # was therefore always true.
            if x.hasChildNodes() and x.nodeName.lower() != 'error':
                self.convertErrorsNG(0, doc, x, level+4, xnodes)
            if not dont_decr:
                c += 1
            if dont_decr:
                dont_decr = 0
        if start:
            # Group the collected markers by their 'uniq' attribute.
            unis = {}
            for x in xnodes:
                if x[3] not in unis:
                    unis[x[3]] = {}
                unis[x[3]][x[2]] = x[1]
                unis[x[3]]['xp'] = x[0]
                unis[x[3]]['code_id'] = x[4]
                if x[2] == u'start':
                    unis[x[3]]['xp_start'] = x[0]
                else:
                    unis[x[3]]['xp_end'] = x[0]
            from xml import xpath
            body = doc.getElementsByTagName('body')[0]
            for x in unis.keys():
                # only complete start/end pairs become marks
                if 'start' not in unis[x]: continue
                if 'end' not in unis[x]: continue

                # get starting node
                xp = unis[x]['xp_start']
                if xp.startswith('/'): xp = xp[1:]
                start_node = xpath.Evaluate(xp, body)[0]
                
                # get ending node
                xp = unis[x]['xp_end']
                if xp.startswith('/'): xp = xp[1:]
                end_node = xpath.Evaluate(xp, body)[0]

                if start_node.isSameNode(end_node):
                    content = start_node.nodeValue[unis[x][u'start']:unis[x][u'end']]
                else:
                    content = self.extractMarkedContent(start_node, end_node, unis[x][u'start'], unis[x][u'end'])
                
                pointer = unis[x]['xp_start']+'#off:'+str(unis[x][u'start'])
                pointer += ';'
                pointer += unis[x]['xp_end']+'#off:'+str(unis[x][u'end'])
                # code is the part of the marker id after the first '-',
                # the author the part of the code before the first '_'
                tmp = unis[x]['code_id']
                try:
                    code = tmp.split('-')[1]
                except IndexError:
                    code = tmp
                author = code.split('_')[0]
                content = content.encode('utf-8')
                err = self.Errors.addNewError(pointer, content, self.getId(), code, author)
                pre, post = self.extractContext(body, start_node, end_node, unis[x][u'start'], unis[x][u'end'], start=1)
                # extractContext collects the preceding pieces walking backwards.
                pre.reverse()
                err.addPreContext(pre)
                err.addPostContext(post)
            # Serialise the children of <body> instead of regex-stripping
            # the tags, which only worked for a bare "<body>".
            bodytag = doc.getElementsByTagName('body')[0]
            inner = u''.join([child.toxml() for child in bodytag.childNodes])
            res = inner.encode('utf-8')
            fd = doc.toxml().encode('utf-8')
            fd = re.sub(r'<\?xml version="1.0" \?>', '', fd)
            self.fulldoc = fd
            self.body = res
        return "ok"

    security.declareProtected(perm_manage, 'mig_walker')
    def mig_walker(self, node, depth=0):
        """ migration walker. generated xpath from error nodes """
        res = ""
        if node.parentNode and depth < 99 and node.parentNode.nodeName.lower() != 'body':
            res += self.mig_walker(node.parentNode, depth + 1)
        index = self.mig_siblingIndex(node)
        if node.nodeType == Node.ELEMENT_NODE:
            res += '/'+node.nodeName+'['+str(index)+']'
            #print "element node"
        elif node.nodeType == Node.DOCUMENT_NODE:
            print "document node"
            raise 'document node'
        elif node.nodeType == Node.TEXT_NODE:
            res += '/text()'
            if index > 1: res += '['+str(index)+']'
        return res

    security.declareProtected(perm_manage, 'mig_siblingIndex')
    def mig_siblingIndex(self, node):
        """ Return the 1-based position of `node` among its like siblings.

        Elements are counted among the sibling elements of the same name,
        text nodes among the sibling text nodes.  Other node types give 0.
        """
        siblings = node.parentNode.childNodes
        kind = node.nodeType
        position = 0
        if kind == Node.ELEMENT_NODE:
            tag = node.nodeName
            for sibling in siblings:
                if sibling.nodeType != Node.ELEMENT_NODE:
                    continue
                if sibling.nodeName == tag:
                    position += 1
                if sibling == node:
                    return position
            return 0
        if kind == Node.TEXT_NODE:
            for sibling in siblings:
                if sibling.nodeType == Node.TEXT_NODE:
                    position += 1
                if sibling == node:
                    return position
            return 0
        return 0
    
    security.declareProtected(perm_edit_document, 'get_uniq_words')
    def get_uniq_words(self):
        """ Count how often each normalised word occurs in textdoc3.

        textdoc3 is the plain text stored by another_txt(), which has to run
        first.  Tokens are split at whitespace and at ')', '(' and '/',
        stripped of punctuation, lower-cased and then counted.
        Returns a dict mapping word -> number of occurrences.

            TODO: 11)Antut -> 11Antut, wrong!
        """
        split_sym = [')', '(', '/']
        strip_chars = ['.', ',', '"', '(', ')', '`', '?', '!', '“', '*', ';', ':','”', '→', '•', '       ·', '„', ']', '[']
        edge_chars = ["´", "'", '-', '/']
        # lower() leaves these untouched when the text is a UTF-8 byte string
        upper_to_lower = [['Õ', 'õ'], ['Ü', 'ü'], ['Ö', 'ö'], ['Ä', 'ä']]
        res = {}
        for wp in self.textdoc3.split():
            word = [wp]
            for sym in split_sym:
                tmp = []
                for x in word:
                    tmp += x.split(sym)
                word = tmp
            for w in word:
                if not w: continue
                for i in strip_chars:
                    w = w.replace(i, '')
                w = w.lower()
                for rem in edge_chars:
                    # Cut the whole prefix/suffix: "´" is more than one byte
                    # in a UTF-8 byte string, so cutting a single character
                    # left a stray byte behind.
                    if w.startswith(rem): w = w[len(rem):]
                    if w.endswith(rem): w = w[:-len(rem)]
                for rep in upper_to_lower:
                    w = w.replace(rep[0], rep[1])
                res[w] = res.get(w, 0) + 1
        return res

    def another_txt(self):
        # Derive a plain-text version of the body and store it as textdoc3:
        # entities are converted, opening/closing <p> and <li> tags become
        # ' \n' and every remaining tag is removed.
        import re
        para_open = re.compile('<P.*?>|<li.*?>', re.I)
        para_close = re.compile('</p>|</li>', re.I)
        any_tag = re.compile('<.*?>', re.I)
        markup = unicode(self.getBody(), 'utf-8')
        markup = convert_entities(markup).encode('utf-8')
        markup = para_open.sub(' \n', markup)
        markup = para_close.sub(' \n', markup)
        self.textdoc3 = any_tag.sub('', markup)

    def _updateDocStatistics(self):
        """ Rebuild the per-word rows of this document in the SQL store.

        Converts fulldoc with the registered 'text/html' converter, splits
        the result with the 'txng.splitters.simple' splitter, deletes the
        existing rows for this document id and inserts one row per token.

        Raises TypeError when the splitter yields something that is not
        unicode.
        """
        from zope.component import getUtility
        from textindexng.interfaces import IConverter, ISplitter

        c = getUtility(IConverter, 'text/html')
        su = getUtility(ISplitter, name="txng.splitters.simple")

        doc = unicode(self.fulldoc, 'utf-8')
        cvtr, encoding = c.convert(doc)
        cvt = unicode(cvtr, encoding)
        spl = su.split(cvt)
        self.sqls.deleteDocref(docid=self.getId())
        for s in spl:
            if not isinstance(s, unicode):
                # A real exception class: raising a plain string is itself
                # an error on Python 2.6 and later.
                raise TypeError('not a unicode!')
            self.sqls.docsInsert(word=s, docid=self.getId(), language=getattr(self, 'docLanguage'), corpus=self.getCorpus())
        return

    def _storeDocInfos(self):
        self.sqls.storeDocument(
                docid=self.getId(), title=self.getTitle(), 
                language=getattr(self, 'docLanguage', 'x'), corpus=self.getCorpus())
      
def get_text(elem):
    """ Return the text directly inside `elem` as a UTF-8 encoded string.

    Used only to extract the document's title.  Children without a node
    value (e.g. nested elements) are skipped instead of breaking the
    concatenation with a TypeError.
    """
    parts = []
    for child in elem.childNodes:
        if child.nodeValue is not None:
            parts.append(child.nodeValue)
    return u''.join(parts).encode('utf-8')

Globals.InitializeClass(Document)