# -*- coding: utf-8
# Copyright 2004-2006 by Vahur Rebas
#
# NOTE(review): this module reached review collapsed onto a few physical
# lines.  Line breaks and indentation below were reconstructed from the
# syntax; where the nesting could not be derived unambiguously it is marked
# with NOTE(review).  In addition, several spans of the text are missing
# (it looks as if everything between a '<' followed by a letter or '/' and
# the next '>' was stripped).  Those fragments are reproduced verbatim, are
# not valid Python, and are marked "SOURCE DAMAGED".  Restore them from
# version control before relying on this file.
#
# NOTE(review): the code is Python 2 only (print statements,
# "except X, e", unicode(), dict.has_key, string exceptions).
#
# NOTE(review): methods that had no docstring are documented with "#"
# comments on purpose -- Zope 2's publisher traditionally refuses to publish
# objects without a docstring, so adding one could expose a method by URL.
# Confirm before converting these comments to docstrings.

import re
import Globals
from Globals import Acquisition, Persistent
from AccessControl import ClassSecurityInfo
from Acquisition import aq_base, aq_inner, aq_parent, Implicit
from OFS.SimpleItem import SimpleItem
from OFS.PropertyManager import PropertyManager
from zope.interface import implements
import xml.dom.minidom
# NOTE(review): this name is shadowed by the Document class defined below.
from xml.dom.minidom import Document
from xml.dom import Node
#from xml.dom.ext import PrettyPrint
#from Products.ZCatalog.CatalogAwareness import CatalogAware
from Products.ZCatalog.CatalogPathAwareness import CatalogAware
from textindexng.interfaces import IIndexableContent
from textindexng.interfaces import IConverter, ISplitter
from textindexng.converters import html, sgml, ooffice, pdf
from textindexng.converters.entities import convert_entities
import types
from interfaces import IDocument
from permissions import *
from schemas import get_schema


class Document(Persistent,SimpleItem,CatalogAware, PropertyManager):
    """ One document: the full HTML text plus the title, description and
    body extracted from it, word/sentence counts and the error marks
    attached to it. """

    implements(IDocument, IIndexableContent)
    meta_type = 'Document'
    security = ClassSecurityInfo()
    #security.declareObjectProtected(perm_view,)
    # NOTE(review): whether the XXX remark below refers to this call or to
    # manage_options cannot be told from the collapsed source.
    security.declareObjectPublic()
    #XXX: this doesn't work!
    manage_options = PropertyManager.manage_options+SimpleItem.manage_options
    #manage_options=(
    #    {'label':'View',
    #     'action':'document_view'},
    #    )+SimpleItem.manage_options

    def __init__(self, _id, _doc, _schema):
        """ Store the id, the full document text and the schema, and create
        the properties and private attributes with their defaults. """
        self.id = _id
        self.fulldoc = _doc
        self.default_catalog = 'zcatalog'
        self.org_fulldoc = '' # original full document. without marked text
        self.org_title = '' # original title getOrgTitle
        self.org_description = '' # original description getOrgDescription
        self.org_body = '' # original body getOrgBody
        self._setProperty('document_status', 0, 'int') # getStatus
        self._setProperty('is_deleted', 0, 'boolean')
        self.title = '' # getTitle
        self.description = '' # getDescription
        self.body = '' # getBody.
        self._setProperty('textdoc', '', 'text')
        self._setProperty('words', 0, 'int')
        self._setProperty('sentences', 0, 'int')
        self._setProperty('lastModifier', '', 'string')
        # document schema
        self._document_schema = _schema
        self._corpus = ''
        self._dsstore = {}
        self._lemmated = False
        self._parsed = False
        self._analysis = ''

    # Fallback attribute lookup: names that are field names of this
    # document's schema are resolved through the schema; anything else
    # raises AttributeError.
    # NOTE(review): getMetaSchema() reads self._document_schema; if that
    # attribute is ever missing, the lookup re-enters __getattr__ and
    # recurses.  The AttributeError also carries no attribute name.
    def __getattr__(self, name):
        if name in get_schema(self.getMetaSchema()).getFieldNames():
            return get_schema(self.getMetaSchema()).getValue(name, self)
        raise AttributeError

    security.declarePrivate('manage_afterAdd')
    def manage_afterAdd(self,obj,container):
        """ Re-derive title, description, body and the 'textdoc' property
        from the full document, recount words, index the object and refresh
        the word statistics and document info in self.sqls.
        Also called directly by saveDocument. """
        print "manage_afterAdd"
        self.title = self.extractTitle()
        self.description = self.extractDescription()
        self.body = self.extractBody()
        # NOTE(review): toText has no definition in the visible source
        # (probably inside one of the damaged regions).
        self._updateProperty('textdoc', self.toText())
        #self.extractErrors()
        self.countWords()
        self.index_object()
        self._updateDocStatistics()
        self._storeDocInfos()
        #self.setGlobalUsedTongues(self.getTongue())
        #self.setGlobalUsedNations(self.getNation())

    security.declarePrivate('manage_beforeDelete')
    def manage_beforeDelete(self, item, container):
        """ unindex object before deletion """
        self.unindex_object()

    security.declareProtected(perm_view_document, 'getId')
    def getId(self):
        """ return id """
        return self.id

    security.declareProtected(perm_view_document, 'getTitle')
    def getTitle(self):
        """ return title """
        return self.title

    # Return the schema object/identifier passed to __init__.
    def getMetaSchema(self):
        return self._document_schema

    # Return the corpus id, or '' for instances that have no _corpus
    # attribute.
    def getCorpus(self):
        return getattr(self, '_corpus', '')

    security.declareProtected(perm_manage, 'setCorpus')
    def setCorpus(self, cid):
        """ set a corpus id, i.e. assign the document to a corpus """
        self._corpus = cid

    security.declareProtected(perm_view_document, 'getDescription')
    def getDescription(self):
        """ return description """
        return self.description

    security.declareProtected(perm_view_document, 'transformToView')
    def transformToView(self):
        """ return body (currently the same value as getBody) """
        body = self.getBody()
        return body

    # Return the value of the 'is_deleted' property.
    def isDeleted(self):
        return self.getProperty('is_deleted')

    security.declareProtected(perm_view_document, 'getBody')
    def getBody(self):
        """ return body """
        return self.body

    # Build the content collector for the TextIndexNG index: the full
    # document as text/html, the marked words and the title.  The 'fields'
    # argument is not used.
    # NOTE(review): getMarkedWords has no definition in the visible source.
    security.declareProtected(perm_view_document, 'indexableContent')
    def indexableContent(self, fields):
        from textindexng.content import IndexContentCollector as ICC
        icc = ICC()
        doc = unicode(self.fulldoc, 'utf-8')
        icc.addBinary('getDocument', doc, 'text/html')
        #icc.addContent('getMarkedWords', unicode(self.getMarkedWords(), 'utf-8'))
        marked = self.getMarkedWords()
        # only byte strings are decoded; any other type is passed through
        if type('') == type(marked):
            marked = unicode(marked, 'utf-8')
        icc.addContent('getMarkedWords', marked)
        icc.addContent('getTitle', unicode(self.getTitle(), 'utf-8'))
        return icc

    security.declareProtected(perm_view_document, 'getDocument')
    def getDocument(self):
        """ Return the full document text.

        Side effect: sets the response Content-type header to
        text/html; charset=UTF-8 -- also when called internally by the
        extract*/saveMarks* methods. """
        self.REQUEST.RESPONSE.setHeader("Content-type","text/html; charset=UTF-8")
        return self.fulldoc

    security.declareProtected(perm_view_document, 'getStatus')
    def getStatus(self):
        """ return document status

            0 - document is in editing mode
            1 - document is in marking mode
        """
        return self.getProperty('document_status')

    security.declareProtected(perm_view_document, 'getSubmitter')
    def getSubmitter(self):
        """ get the username who created document
        (second element of getOwnerTuple()) """
        return self.getOwnerTuple()[1]

    security.declareProtected(perm_view_document, 'getLastModder')
    def getLastModder(self):
        """ last modifier of the document, or -1 when the attribute
        is missing """
        try:
            return self.lastModifier
        except AttributeError:
            return -1

    security.declareProtected(perm_change_status, 'start_modding')
    def start_modding(self, REQUEST):
        """ Start marking the document.

        If the status is still 0, set it to 1 and copy fulldoc, title,
        description and body into the org_* attributes.  Always redirects
        to 'document_mark'. """
        if not self.getStatus():
            self._updateProperty('document_status', 1)
            import copy
            self.org_fulldoc = copy.deepcopy(self.fulldoc)
            self.org_title = copy.deepcopy(self.title)
            self.org_description = copy.deepcopy(self.description)
            self.org_body = copy.deepcopy(self.body)
            # NOTE(review): reindex_object() may originally have been
            # outside this "if"; the nesting is not recoverable.
            self.reindex_object()
        return REQUEST.RESPONSE.redirect('document_mark')

    # The three accessors below fall back to a default for instances that
    # do not have the attribute.
    def isLemmated(self):
        return getattr(self, '_lemmated', False)

    def isParsed(self):
        return getattr(self, '_parsed', False)

    def getAnalysis(self):
        return getattr(self, '_analysis', "")

    security.declareProtected(perm_edit_document, 'extractTitle')
    def extractTitle(self):
        """ Extract the text of the first <title> element of the document.

        Raises IndexError when the document has no <title> element. """
        doc = xml.dom.minidom.parseString(self.getDocument())
        title = doc.getElementsByTagName('title')[0]
        return get_text(title)

    security.declareProtected(perm_edit_document, 'extractDescription')
    def extractDescription(self):
        """ Return the 'content' attribute of the first <meta> element
        whose name is 'description', or "" when there is none. """
        doc = xml.dom.minidom.parseString(self.getDocument())
        desc = doc.getElementsByTagName('meta')
        for x in desc:
            if x.getAttribute('name') == 'description':
                return x.getAttribute('content')
        return ""

    security.declareProtected(perm_edit_document, 'extractBody')
    def extractBody(self):
        """ Return the first <body> element of the document serialised as a
        UTF-8 byte string, after applying the regex substitution below. """
        doc = xml.dom.minidom.parseString(self.getDocument())
        bodytag = doc.getElementsByTagName('body')[0]
        stri = bodytag.toxml()
        # SOURCE DAMAGED: the pattern seems to have lost its markup; as
        # written both groups are empty, so the substitution removes
        # nothing.  Presumably it stripped the opening and closing body
        # tags -- restore from version control.
        b = re.compile('()|()', re.I)
        stri = stri.encode('utf-8')
        res = b.sub('', stri)
        res = unicode(res, 'utf-8').encode('utf-8')
        return res

    security.declareProtected(perm_edit_document, 'saveDocument')
    def saveDocument(self,REQUEST):
        """ Save the edited document taken from REQUEST.kupu, re-run the
        manage_afterAdd processing, record the authenticated user as last
        modifier and redirect to the document. """
        self.fulldoc = REQUEST.kupu
        self.manage_afterAdd(self,self.aq_parent)
        # Fall back to creating the property when updating it fails.
        # NOTE(review): bare except -- hides every other error as well.
        try:
            self._updateProperty('lastModifier', str(REQUEST.AUTHENTICATED_USER))
        except:
            self._setProperty('lastModifier', str(REQUEST.AUTHENTICATED_USER), 'string')
        return REQUEST.RESPONSE.redirect(self.absolute_url())

    security.declareProtected(perm_view_document, 'getMarks')
    def getMarks(self, REQUEST):
        """ return marks of this document for the authenticated user, as
        produced by self.Errors.getDocumentMarksHTML (for the marking
        page) """
        return self.Errors.getDocumentMarksHTML(self.getId(), str(REQUEST.AUTHENTICATED_USER))

    security.declareProtected('do_not_touch_me', 'saveMarksNG_rescue')
    def saveMarksNG_rescue(self, REQUEST):
        """ Variant of saveMarksNG: each range_N value is a string that is
        eval()ed, progress is printed, and the author is taken from the
        'username' request value instead of the authenticated user. """
        count = 0  # NOTE(review): never used
        from xml import xpath
        # determine how many pointer we have
        coun = 1000
        # request keys range_1 .. range_999 are probed; missing ones skipped
        for x in range(1,coun):
            pair = REQUEST.get('range_'+str(x), None)
            if not pair:
                continue
            # NOTE(review): SECURITY -- eval() on a request value executes
            # arbitrary code supplied by the client.
            pair = eval(pair)
            print "-->", x, pair
            # pair[0] is the range pointer, pair[1] the code and the
            # optional pair[2] the id of an already existing mark
            r = pair[0]
            pointer = pair[0]
            try:
                code = pair[1]
            except IndexError:
                print "index error 1", pair
                continue
            existing = None
            try:
                existing = pair[2]
            except IndexError:
                pass
            # NOTE(review): nesting inside this "if" was reconstructed --
            # as laid out, entries that reference an existing mark are
            # never added again.
            if existing:
                if existing == 'DELETEME':
                    continue
                if 'DELETEME' in pair:
                    self.Errors.deleteError(existing)
                print "existing...", pair
                continue
            # pointer layout as parsed here: "<start>#..:<off>;<end>#..:<off>"
            r_st, r_en = r.split(';')
            doc = xml.dom.minidom.parseString(self.getDocument())
            start, st_offset = r_st.split('#')
            st_offset = st_offset.split(':')[1]
            end, en_offset = r_en.split('#')
            en_offset = en_offset.split(':')[1]
            body = doc.getElementsByTagName('body')[0]
            start = start.lower()
            end = end.lower()
            # paths are evaluated relative to <body>, so drop a leading '/'
            if start.startswith('/'):
                start = start[1:]
            if end.startswith('/'):
                end = end[1:]
            print "....", x, start
            start_node = xpath.Evaluate(start, body)[0] # always take the
            end_node = xpath.Evaluate(end, body)[0] # first node we get
            if start_node.isSameNode(end_node):
                content = start_node.nodeValue[int(st_offset):int(en_offset)]
            else:
                # NOTE(review): extractMarkedContent has no definition in
                # the visible source.
                content = self.extractMarkedContent(start_node, end_node, int(st_offset), int(en_offset))
            author = REQUEST.get('username')
            if type(content) == types.UnicodeType:
                content = content.encode('utf-8')
            err = self.Errors.addNewError(pointer, content, self.getId(), code, author)
            pre, post = self.extractContext(body, start_node, end_node, int(st_offset), int(en_offset), start=1)
            pre.reverse()
            err.addPreContext(pre)
            err.addPostContext(post)
        self.reindex_object()
        return REQUEST.RESPONSE.redirect(self.absolute_url())

    security.declareProtected(perm_mark_document, 'saveMarksNG')
    def saveMarksNG(self, REQUEST):
        """ Save the marks submitted as range_1 .. range_999 request values.

        For every new range the start and end nodes are looked up by XPath
        below <body>, the marked text is extracted and stored through
        self.Errors.addNewError together with its pre- and post-context.
        Reindexes the document and redirects to it. """
        count = 0  # NOTE(review): never used
        from xml import xpath
        # determine how many pointer we have
        coun = 1000
        for x in range(1,coun):
            pair = REQUEST.get('range_'+str(x), None)
            if not pair:
                continue
            # presumably a sequence (pointer, code[, existing id]) built by
            # the request marshalling -- TODO confirm against the form
            r = pair[0]
            pointer = pair[0]
            try:
                code = pair[1]
            except IndexError:
                continue
            existing = None
            try:
                existing = pair[2]
            except IndexError:
                pass
            # NOTE(review): nesting inside this "if" was reconstructed; the
            # final "continue" may originally have belonged to the
            # 'DELETEME' branch only.
            if existing:
                if existing == 'DELETEME':
                    continue
                if 'DELETEME' in pair:
                    self.Errors.deleteError(existing)
                continue
            r_st, r_en = r.split(';')
            doc = xml.dom.minidom.parseString(self.getDocument())
            start, st_offset = r_st.split('#')
            st_offset = st_offset.split(':')[1]
            end, en_offset = r_en.split('#')
            en_offset = en_offset.split(':')[1]
            body = doc.getElementsByTagName('body')[0]
            start = start.lower()
            end = end.lower()
            if start.startswith('/'):
                start = start[1:]
            if end.startswith('/'):
                end = end[1:]
            # NOTE(review): debugging output; prints the whole document for
            # every range.
            print self.getDocument()
            print body
            print start
            try:
                start_node = xpath.Evaluate(start, body)[0] # always take the
            except IndexError, ie:
                #
                # this is causing bug!
                # start == span/p[8]/span/text()
                # should be p[8]/span/text()
                raise ie
            end_node = xpath.Evaluate(end, body)[0] # first node we get
            if start_node.isSameNode(end_node):
                content = start_node.nodeValue[int(st_offset):int(en_offset)]
            else:
                # NOTE(review): extractMarkedContent has no definition in
                # the visible source.
                content = self.extractMarkedContent(start_node, end_node, int(st_offset), int(en_offset))
            author = str(REQUEST.AUTHENTICATED_USER)
            if type(content) == types.UnicodeType:
                content = content.encode('utf-8')
            err = self.Errors.addNewError(pointer, content, self.getId(), code, author)
            pre, post = self.extractContext(body, start_node, end_node, int(st_offset), int(en_offset), start=1)
            pre.reverse()
            err.addPreContext(pre)
            err.addPostContext(post)
        self.reindex_object()
        return REQUEST.RESPONSE.redirect(self.absolute_url())

    security.declareProtected(perm_edit_document, 'extractContext')
    def extractContext(self, body, start_node, end_node, start_offset, end_offset, start = 0, res = None, seen_start=0, seen_end=0, pre='', post='', node=None):
        """ Collect the text before start_node/start_offset and after
        end_node/end_offset by walking the tree below body recursively.

        Callers pass start=1; the remaining keyword arguments carry state
        between recursive calls (the recursive call unpacks
        seen_start, seen_end, pre, post from the result).

        Original description of the start=1 result:
            returning n contexts of different sizes
            ['sentences', 'from', 'closes', 'to', 'five'] #precontent
            ['sentences', 'from', 'closes', 'to', 'five'] #postcontent

        NOTE(review): the end of this method is missing from the source,
        so the actual return statements cannot be confirmed.
        """
        if start:
            # top-level call: reset the accumulators and start at body
            res = []
            pre = post = ''
            seen_start = seen_end = 0
            node = body
        # do stuff
        for x in node.childNodes:
            done_this = 0
            if x.isSameNode(start_node):
                seen_start = 1
            if x.isSameNode(end_node):
                seen_end = 1
            # the start node contributes its text before the offset ...
            if x.isSameNode(start_node):
                pre += x.nodeValue[:start_offset]
                done_this = 1
            # ... and the end node its text after the offset
            if x.isSameNode(end_node):
                post += x.nodeValue[end_offset:]
                done_this = 1
            # nodes before the mark: value-less nodes count as one blank
            if not seen_start and not seen_end and not done_this:
                if x.nodeValue is None:
                    pre += ' '
                else:
                    pre += x.nodeValue
            if not done_this and seen_start and not seen_end:
                # huh? it doesn't interest us?
                pass
            # nodes after the mark
            if seen_end and not done_this:
                if x.nodeValue is None:
                    post += ' '
                else:
                    post += x.nodeValue
            if x.hasChildNodes():
                seen_start, seen_end, pre, post = self.extractContext(body, start_node, end_node, start_offset, end_offset, 0, res, seen_start, seen_end, pre, post, x)
        if start:
            # do postprocessing
            # scan pre backwards, cutting at '.', '!' and '?'; keep the
            # first five pieces found
            i = len(pre)
            buff = ''
            pre_sent = []
            while i>0:
                i += -1
                if pre[i] == '.' or pre[i] == '!' or pre[i] == '?':
                    pre_sent.append(buff)
                    buff = ''
                buff = pre[i] + buff
            pre_sent = pre_sent[:5]
            i = 0
            buff = ''
            post_sent = []
            # SOURCE DAMAGED: everything between "while i" and '>",code'
            # is missing -- the rest of extractContext, an unknown number
            # of complete definitions (extractMarkedContent, getMarkedWords
            # and toText are called elsewhere but not defined in the
            # visible source) and the head of the method whose loop tail
            # follows.  The fragment is reproduced verbatim; its
            # indentation is a guess and it is not valid Python.
            while i>",code, code.aq_parent, code.aq_parent.meta_type
            try:
                tmp = code.aq_parent
            except AttributeError:
                continue
            # climb the acquisition parents while they are 'mark' objects,
            # collecting each title once
            while tmp.meta_type == 'mark':
                if tmp not in done:
                    done.append(tmp)
                    res.append(tmp.getTitle())
                tmp = tmp.aq_parent
        return res

    security.declareProtected(perm_view_document, 'get_n_of_words')
    def get_n_of_words(self):
        """ words in document ('words' property, set by countWords) """
        return self.getProperty('words')

    security.declareProtected(perm_view_document, 'get_n_of_sentences')
    def get_n_of_sentences(self):
        """ number of sentences in document ('sentences' property, set by
        countWords) """
        return self.getProperty('sentences')

    security.declareProtected(perm_view_document, 'prettyDate')
    def prettyDate(self):
        """ bobobase_modification_time formatted as dd-mm-YYYY HH:MM """
        time = self.bobobase_modification_time().strftime('%d-%m-%Y %H:%M')
        return time

    security.declareProtected(perm_view_document, 'rawDate')
    def rawDate(self):
        """ raw date for indexing (bobobase_modification_time().ISO()) """
        return self.bobobase_modification_time().ISO()

    security.declareProtected(perm_edit_document, 'countWords')
    def countWords(self):
        """ Count words and sentences of the 'textdoc' property and store
        them in the 'words' and 'sentences' properties.

        Sentences are the number of '.', '!' and '?' characters; words are
        the whitespace-separated tokens.  Returns None. """
        sentences, words = 0, 0
        doc = self.getProperty('textdoc')
        sentences += doc.count('.') + doc.count('!') + doc.count('?')
        tempwords = doc.split()
        words += len(tempwords)
        self._updateProperty('words', words)
        self._updateProperty('sentences', sentences)
        return

    security.declareProtected(perm_manage, 'convertErrorsNG')
    # NOTE(review): "xnodes = []" is a mutable default argument shared
    # between calls; it is rebound to a fresh list when start is true.
    def convertErrorsNG(self, start=1, doc=None, node=None, level=0, xnodes = []):
        """ Convert errors: with start true, parse the document, collect
        entries in xnodes, create an error object per collected start/end
        pair through self.Errors.addNewError and store the re-serialised
        document and body.

        NOTE(review): the tree-walking part of this method is missing from
        the source; what xnodes entries contain can only be read off the
        code that consumes them (x[0]..x[4]).
        """
        if start:
            doc = xml.dom.minidom.parseString(self.getDocument())
            node = doc.firstChild
            node = node.nextSibling
            #PrettyPrint(doc)
            xnodes = []
        childs = node.childNodes
        dont_decr = 0
        c = 0
        c_tot = len(childs)
        # SOURCE DAMAGED: the loop condition and the beginning of the loop
        # body are missing after "while c"; the stray quote is what is left
        # of a print statement.  Reproduced verbatim, not valid Python.
        while c"
            #if x.parentNode.nodeName=='error':
            #    print level*' ', x.nodeValue
            #elif x.nodeType == Node.TEXT_NODE:
            #    nodeval = x.nodeValue
            #    print level*' ', nodeval.encode('utf-8')
            if x.nodeName.lower() == 'error':
                #print "---------------------------------------- removing"
                ts = node.childNodes
                i = 0
                back = 0
                look_node = 0
                # SOURCE DAMAGED: the inner loop and the rest of the walker
                # are missing after "while i".  Reproduced verbatim.
                while i"
        if start:
            #PrettyPrint(doc)
            # Group the collected entries by x[3]: x[2] (u'start' or
            # another value, treated as the end) maps to the offset x[1],
            # x[0] is the xpath and x[4] the code id.
            unis = {}
            for x in xnodes:
                if not unis.has_key(x[3]):
                    unis[x[3]] = {}
                unis[x[3]][x[2]] = x[1]
                unis[x[3]]['xp'] = x[0]
                unis[x[3]]['code_id'] = x[4]
                if x[2] == u'start':
                    unis[x[3]]['xp_start'] = x[0]
                else:
                    unis[x[3]]['xp_end'] = x[0]
            from xml import xpath
            body = doc.getElementsByTagName('body')[0]
            add_counter = 0
            for x in unis.keys():
                # only complete start/end pairs are converted
                if not unis[x].has_key('start'):
                    continue
                if not unis[x].has_key('end'):
                    continue
                # get starting node
                xp = unis[x]['xp_start']
                if xp.startswith('/'):
                    xp = xp[1:]
                start_node = xpath.Evaluate(xp, body)[0]
                # get ending node
                xp = unis[x]['xp_end']
                if xp.startswith('/'):
                    xp = xp[1:]
                end_node = xpath.Evaluate(xp, body)[0]
                if start_node.isSameNode(end_node):
                    content = start_node.nodeValue[unis[x][u'start']:unis[x][u'end']]
                else:
                    content = self.extractMarkedContent(start_node, end_node, unis[x][u'start'], unis[x][u'end'])
                # pointer layout: "<xp_start>#off:<n>;<xp_end>#off:<m>"
                pointer = unis[x]['xp_start']+'#off:'+str(unis[x][u'start'])
                pointer += ';'
                pointer += unis[x]['xp_end']+'#off:'+str(unis[x][u'end'])
                tmp = unis[x]['code_id']
                # the code is the part after the first '-', if there is one
                try:
                    code = tmp.split('-')[1]
                except IndexError:
                    code = tmp
                # the author is the part of the code before the first '_'
                author = code.split('_')[0]
                add_counter += 1
                content = content.encode('utf-8')
                err = self.Errors.addNewError(pointer, content, self.getId(), code, author)
                pre, post = self.extractContext(body, start_node, end_node, unis[x][u'start'], unis[x][u'end'], start=1)
                pre.reverse()
                err.addPreContext(pre)
                err.addPostContext(post)
            print "--------- TOTAL ADD:", add_counter
            bodytag = doc.getElementsByTagName('body')[0]
            stri = bodytag.toxml()
            # SOURCE DAMAGED: same emptied pattern as in extractBody; as
            # written the substitution removes nothing.
            b = re.compile('()|()', re.I)
            stri = stri.encode('utf-8')
            res = b.sub('', stri)
            res = unicode(res, 'utf-8').encode('utf-8')
            fd = doc.toxml().encode('utf-8')
            # drop the XML declaration that toxml() put in front
            fd = re.sub('<\?xml version="1.0" \?>', '', fd)
            self.fulldoc = fd
            self.body = res
            # NOTE(review): whether this return belongs inside "if start"
            # is not recoverable from the source.
            return "ok"

    security.declareProtected(perm_manage, 'mig_walker')
    def mig_walker(self, node, depth=0):
        """ Migration walker: build an xpath for node relative to the
        <body> element, e.g. '/p[2]/span[1]/text()[2]'.

        Recurses through the parents until the parent is <body> (or depth
        reaches 99).  Text nodes get '/text()', with an index only when it
        is greater than 1. """
        res = ""
        if node.parentNode and depth < 99 and node.parentNode.nodeName.lower() != 'body':
            res += self.mig_walker(node.parentNode, depth + 1)
        index = self.mig_siblingIndex(node)
        if node.nodeType == Node.ELEMENT_NODE:
            res += '/'+node.nodeName+'['+str(index)+']'
            #print "element node"
        elif node.nodeType == Node.DOCUMENT_NODE:
            print "document node"
            # NOTE(review): string exceptions are rejected by Python 2.6+
            # (this line raises TypeError there).
            raise 'document node'
        elif node.nodeType == Node.TEXT_NODE:
            res += '/text()'
            if index > 1:
                res += '['+str(index)+']'
        return res

    security.declareProtected(perm_manage, 'mig_siblingIndex')
    def mig_siblingIndex(self, node):
        """ Return the 1-based position of node among its siblings of the
        same kind: same tag name for elements, text nodes for text nodes.
        Returns 0 for other node types. """
        siblings = node.parentNode.childNodes
        count = 0
        res = 0
        if node.nodeType == Node.ELEMENT_NODE:
            name = node.nodeName
            for x in siblings:
                if x.nodeType == Node.ELEMENT_NODE:
                    if x.nodeName == name:
                        count += 1
                if x == node:
                    res = count
                    break
        elif node.nodeType == Node.TEXT_NODE:
            for x in siblings:
                if x.nodeType == Node.TEXT_NODE:
                    count += 1
                if x == node:
                    res = count
                    break
        return res

    security.declareProtected(perm_edit_document, 'get_uniq_words')
    def get_uniq_words(self):
        """ Return a dict mapping each normalised word of self.textdoc3 to
        its number of occurrences.

        self.textdoc3 is written by another_txt(), which therefore has to
        run first.

        TODO: 11)Antut -> 11Antut, wrong!
        """
        doc = self.textdoc3
        tempwords = doc.split()
        res = {}
        printed = []  # only referenced by the commented-out debug code
        for wp in tempwords:
            org_org_word = wp
            # split every token further at ')', '(' and '/'
            split_sym = [')', '(', '/']
            word = [wp]
            for sym in split_sym:
                tmp = []
                for x in word:
                    tmp += x.split(sym)
                word = tmp
            for w in word:
                if not w:
                    continue
                org_w = w
                # remove punctuation anywhere in the word
                for i in ['.', ',', '"', '(', ')', '`', '?', '!', '“', '*', ';', ':','”', '→', '•', ' ·', '„', ']', '[']:
                    w = w.replace(i, '')
                w = w.replace('`', '\'')
                w = w.lower()
                #if len(w) == 0:
                #    if self.getId() not in printed:
                #        print self.getId(), w, org_w
                #        printed.append(self.getId())
                # NOTE(review): on byte strings a non-ASCII marker such as
                # the acute accent is longer than one byte, but exactly
                # one byte is cut off here.
                for rem in ["´", "'", '-', '/']:
                    if w.startswith(rem):
                        w = w[1:]
                    if w.endswith(rem):
                        w = w[:-1]
                # explicit lower-casing of the Estonian capitals
                for rep in [['Õ', 'õ'], ['Ü', 'ü'], ['Ö', 'ö'], ['Ä', 'ä']]:
                    w = w.replace(rep[0], rep[1])
                # NOTE(review): a word reduced to '' is still counted.
                if res.has_key(w):
                    res[w] = res[w] + 1
                else:
                    res[w] = 1
                # debugging output
                if w == 'v':
                    print 'V::', self.getId(), org_org_word
                if w.startswith('haal'):
                    print "HAAL::", self.getId(), org_org_word
        return res

    # Build a plain-text version of the body -- entities converted, the
    # p_start/p_end matches replaced by ' \n', remaining tags removed --
    # and store it in self.textdoc3.  Returns None.
    def another_txt(self):
        import re
        # SOURCE DAMAGED: both patterns below lost their markup.  p_start
        # is now two empty alternatives; the p_end literal is broken across
        # lines and is not valid Python.  Reproduced verbatim.
        p_start = re.compile('|', re.I)
        p_end = re.compile('

|', re.I)
        all = re.compile('<.*?>', re.I)
        txt = unicode(self.getBody(), 'utf-8')
        txt = convert_entities(txt).encode('utf-8')
        txt2 = re.sub(p_start, ' \n', txt)
        txt = re.sub(p_end, ' \n', txt2)
        value = re.sub(all, '',txt)
        self.textdoc3 = value
        #if self.getId() == 'doc_491521501739_item':
        #    print "="*40
        #    print txt
        #    print "="*40
        #    print value

    def _updateDocStatistics(self):
        """ Convert the full document with the 'text/html' converter, split
        it with the 'txng.splitters.simple' splitter and replace this
        document's word rows in self.sqls (deleteDocref, then one
        docsInsert per word). """
        from zope.component import getUtility
        from textindexng.interfaces import IConverter, ISplitter
        c = getUtility(IConverter, 'text/html')
        su = getUtility(ISplitter, name="txng.splitters.simple")
        doc = unicode(self.fulldoc, 'utf-8')
        cvtr, encoding = c.convert(doc)
        cvt = unicode(cvtr, encoding)
        spl = su.split(cvt)
        self.sqls.deleteDocref(docid=self.getId())
        for s in spl:
            if not isinstance(s, unicode):
                # NOTE(review): string exception, see mig_walker.
                raise 'not a unicode!'
            # NOTE(review): docLanguage is read without a default here but
            # with default 'x' in _storeDocInfos.
            self.sqls.docsInsert(word=s, docid=self.getId(), language=getattr(self, 'docLanguage'), corpus=self.getCorpus())
        return

    # Store id, title, language (default 'x') and corpus of this document
    # through self.sqls.storeDocument.
    def _storeDocInfos(self):
        self.sqls.storeDocument(
            docid=self.getId(),
            title=self.getTitle(),
            language=getattr(self, 'docLanguage', 'x'),
            corpus=self.getCorpus())


def get_text(elem):
    """ Concatenate the nodeValue of all direct children of elem and return
    it as a UTF-8 byte string.  Used only to extract the document title.

    NOTE(review): a child without a nodeValue (e.g. a nested element)
    makes the concatenation fail. """
    res=u''
    for line in elem.childNodes:
        res += line.nodeValue
    return res.encode('utf-8')


Globals.InitializeClass(Document)