Package Gnumed :: Package pycommon :: Module gmMatchProvider
[frames | no frames]

Source Code for Module Gnumed.pycommon.gmMatchProvider

  1   
  2  __doc__ = """Base classes for match providers. 
  3   
  4  They are used by business objects to give 
  5  phrasewheels the ability to guess phrases. 
  6   
  7  Copyright (C) GNUMed developers 
  8  license: GPL v2 or later 
  9  """ 
 10  __author__  = "K.Hilbert <Karsten.Hilbert@gmx.net>, I.Haywood <ihaywood@gnu.org>, S.J.Tan <sjtan@bigpond.com>" 
 11   
 12  # std lib 
 13  import sys 
 14  import logging 
 15  import re as regex 
 16  import datetime as pydt 
 17   
 18   
 19  # GNUmed 
 20  if __name__ == "__main__": 
 21          sys.path.insert(0, '../../') 
 22  from Gnumed.pycommon import gmPG2 
 23   
 24   
 25  _log = logging.getLogger('gm.ui') 
 26   
 27   
 # these are stripped from the fragment passed to the
# match provider before looking for matches
#
# the double quote must live INSIDE the character class: when
# appended after the "+" quantifier it becomes a mandatory trailing
# literal and ignored characters are only stripped if they happen
# to be directly followed by a double quote
default_ignored_chars = r'''[?!.'(){}\[\]<>~#*$%^_"]+'''

# these are used to detect word boundaries which is,
# in turn, used to normalize word boundaries in the
# input fragment
default_word_separators = '[- \t=+&:@]+'
 36  #============================================================ 
class cMatchProvider(object):
    """Base class for match providing objects.

    Match sources might be:
    - database tables
    - flat files
    - previous input
    - config files
    - in-memory list created on the fly

    getMatches() normalizes the input fragment and, depending on the
    number of significant characters, dispatches to getMatchesByPhrase(),
    getMatchesByWord() or getMatchesBySubstr(). Those three methods and
    getAllMatches() raise NotImplementedError here and must be
    implemented by subclasses.
    """
    # class-wide switch which subclasses may consult to print debugging output
    print_queries = False
    #--------------------------------------------------------
    def __init__(self):
        self.setThresholds()

        # values set via set_context(), keyed by placeholder name
        self._context_vals = {}
        self.__ignored_chars = regex.compile(default_ignored_chars)
        # used to normalize word boundaries:
        self.__word_separators = regex.compile(default_word_separators)
    #--------------------------------------------------------
    # actions
    #--------------------------------------------------------
    def getMatches(self, aFragment = None):
        """Return matches according to aFragment and matching thresholds.

        The fragment is lower-cased, stripped of ignored characters and
        has its word separators normalized to single spaces before its
        length is compared against the thresholds.

        Returns whatever the selected get*() method returns, or
        (False, []) if the fragment is shorter than the phrase threshold.

        Raises ValueError if aFragment is None.

        FIXME: design decision: we don't worry about data source changes
               during the lifetime of a MatchProvider
        FIXME: append _("*get all items*") on truncation
        """
        # sanity check
        if aFragment is None:
            raise ValueError('Cannot find matches without a fragment.')

        # user explicitly wants all matches
        if aFragment == '*':
            return self.getAllMatches()

        # case insensitivity
        tmpFragment = aFragment.lower()
        # remove ignored chars
        if self.__ignored_chars is not None:
            tmpFragment = self.__ignored_chars.sub('', tmpFragment)
        # normalize word separators
        if self.__word_separators is not None:
            tmpFragment = ' '.join(self.__word_separators.split(tmpFragment))
        # length in number of significant characters only
        lngFragment = len(tmpFragment)

        # order is important: test the largest threshold first
        if lngFragment >= self.__threshold_substring:
            return self.getMatchesBySubstr(tmpFragment)
        if lngFragment >= self.__threshold_word:
            return self.getMatchesByWord(tmpFragment)
        if lngFragment >= self.__threshold_phrase:
            return self.getMatchesByPhrase(tmpFragment)
        return (False, [])
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all available items, to be implemented by subclasses."""
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches at the start of phrases, to be implemented by subclasses."""
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches at the start of words, to be implemented by subclasses."""
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches anywhere inside phrases, to be implemented by subclasses."""
        raise NotImplementedError
    #--------------------------------------------------------
    def get_match_by_data(self, data=None):
        """Return the match belonging to <data>, None in this base class."""
        return None
    #--------------------------------------------------------
    # configuration
    #--------------------------------------------------------
    def setThresholds(self, aPhrase = 1, aWord = 3, aSubstring = 5):
        """Set match location thresholds.

        - the fragment passed to getMatches() must contain at least this many
          characters before it triggers a match search at:
          1) phrase_start - start of phrase (first word)
          2) word_start - start of any word within phrase
          3) in_word - _inside_ any word within phrase

        Returns True if the thresholds were applied, False if they were
        rejected because they are not ordered phrase <= word <= substring
        (in which case the previous thresholds are retained).
        """
        # sanity checks
        if aSubstring < aWord:
            _log.error(
                'Setting substring threshold (%s) lower than word-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
                aSubstring,
                aWord,
                self.__threshold_substring,
                self.__threshold_word
            )
            return False
        if aWord < aPhrase:
            # report the two values actually being compared (word vs phrase)
            _log.error(
                'Setting word-start threshold (%s) lower than phrase-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
                aWord,
                aPhrase,
                self.__threshold_word,
                self.__threshold_phrase
            )
            return False

        # now actually reassign thresholds
        self.__threshold_phrase = aPhrase
        self.__threshold_word = aWord
        self.__threshold_substring = aSubstring

        return True
    #--------------------------------------------------------
    def _set_word_separators(self, word_separators=None):
        # None disables word separator normalization in getMatches()
        if word_separators is None:
            self.__word_separators = None
        else:
            self.__word_separators = regex.compile(word_separators)

    def _get_word_separators(self):
        if self.__word_separators is None:
            return None
        return self.__word_separators.pattern

    # regex (as a string) matching runs of word separators, or None
    word_separators = property(_get_word_separators, _set_word_separators)
    #--------------------------------------------------------
    def _set_ignored_chars(self, ignored_chars=None):
        # None disables stripping of ignored characters in getMatches()
        if ignored_chars is None:
            self.__ignored_chars = None
        else:
            self.__ignored_chars = regex.compile(ignored_chars)

    def _get_ignored_chars(self):
        if self.__ignored_chars is None:
            return None
        return self.__ignored_chars.pattern

    # regex (as a string) matching characters to be ignored, or None
    ignored_chars = property(_get_ignored_chars, _set_ignored_chars)
    #--------------------------------------------------------
    def set_context (self, context=None, val=None):
        """Set value to provide context information for matches.

        The matching code may ignore it depending on its exact
        implementation. Names and values of the context depend
        on what is being matched.

        <context> -- the *placeholder* key *inside* the context
                     definition, not the context *definition* key

        Returns False if <context> is None, True otherwise.
        """
        if context is None:
            return False
        self._context_vals[context] = val
        return True
    #--------------------------------------------------------
    def unset_context(self, context=None):
        """Remove the value for <context>, silently ignoring unknown keys."""
        self._context_vals.pop(context, None)
182 #------------------------------------------------------------ 183 # usable instances 184 #------------------------------------------------------------
class cMatchProvider_FixedList(cMatchProvider):
    """Match provider where all possible options can be held
    in a reasonably sized, pre-allocated list.
    """
    def __init__(self, aSeq = None):
        """aSeq must be a list/tuple of dicts, or None for "no items (yet)".

        Each dict must have the keys (data, list_label, weight).

        Raises TypeError if aSeq is neither None nor a list/tuple.
        """
        if not (aSeq is None or isinstance(aSeq, (list, tuple))):
            _log.error('fixed list match provider argument must be a list/tuple of dicts/None')
            raise TypeError('fixed list match provider argument must be a list/tuple of dicts/None')

        self.__items = aSeq
        cMatchProvider.__init__(self)

    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #   - aFragment will not be "None"
    #   - aFragment will be lower case
    #   - we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""
        fragment = aFragment.lower()
        return self.__as_result ([
            item for item in self.__candidates()
            if item['list_label'].lower().startswith(fragment)
        ])

    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""
        fragment = aFragment.lower()
        # looking for " <fragment>" (rather than inspecting only the
        # first occurrence of <fragment>) also finds labels in which
        # the fragment first shows up inside a word and only later
        # at the start of one
        spaced_fragment = ' ' + fragment
        matches = []
        for item in self.__candidates():
            label = item['list_label'].lower()
            if label.startswith(fragment) or (spaced_fragment in label):
                matches.append(item)
        return self.__as_result(matches)

    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""
        fragment = aFragment.lower()
        return self.__as_result ([
            item for item in self.__candidates()
            if fragment in item['list_label'].lower()
        ])

    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all items.

        The items are returned as a new, sorted list such that
        the sequence handed to the provider is left untouched.
        """
        return self.__as_result(self.__candidates())

    #--------------------------------------------------------
    def set_items(self, items):
        """items must be a list of dicts. Each dict must have the keys (data, list_label, weight)"""
        self.__items = items

    #--------------------------------------------------------
    # helpers
    #--------------------------------------------------------
    def __candidates(self):
        """Return the items to search in, treating None as "no items"."""
        if self.__items is None:
            return []
        return self.__items

    #--------------------------------------------------------
    def __as_result(self, matches):
        """Turn a sequence of matching items into (found, matches sorted by descending weight)."""
        if len(matches) == 0:
            return (False, [])
        return (True, sorted(matches, key = lambda x: x['weight'], reverse = True))
279 280 # #-------------------------------------------------------- 281 # def __cmp_items(self, item1, item2): 282 # """Compare items based on weight.""" 283 # if item1['weight'] == item2['weight']: 284 # return 0 285 # 286 # # do it the wrong way round to do sorting/reversing at once 287 # if item1['weight'] < item2['weight']: 288 # return 1 289 # if item1['weight'] > item2['weight']: 290 # return -1 291 292 # ===========================================================
class cMatchProvider_Func(cMatchProvider):
    """Match provider which searches matches
    in the results of a function call.
    """
    def __init__(self, get_candidates = None):
        """get_candidates() must return a list of dicts each of which has a 'list_label' key.

        Raises ValueError if get_candidates is None.
        """
        if get_candidates is None:
            _log.error('must define function to retrieve match candidates list')
            raise ValueError('must define function to retrieve match candidates list')

        self._get_candidates = get_candidates
        cMatchProvider.__init__(self)
    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #   - aFragment will not be "None"
    #   - aFragment will be lower case
    #   - we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""
        # the *label* must start with the fragment, not vice versa
        return self.__as_result ([
            candidate for candidate in self._get_candidates()
            if candidate['list_label'].lower().startswith(aFragment)
        ])
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""
        # FIXME: use word seps
        spaced_fragment = ' ' + aFragment
        matches = []
        for candidate in self._get_candidates():
            label = candidate['list_label'].lower()
            # either at the very start or right after a space, candidates
            # not containing the fragment at all must not be considered
            if label.startswith(aFragment) or (spaced_fragment in label):
                matches.append(candidate)
        return self.__as_result(matches)
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""
        return self.__as_result ([
            candidate for candidate in self._get_candidates()
            if aFragment in candidate['list_label'].lower()
        ])
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all candidates.

        Returns (found, candidates) just like the other get*()
        methods such that getMatches('*') delivers the same
        shape of result as any other fragment.
        """
        return self.__as_result(list(self._get_candidates()))
    #--------------------------------------------------------
    def __as_result(self, matches):
        """Turn a list of matching candidates into (found, ordered matches)."""
        if len(matches) == 0:
            return (False, [])
        return (True, sorted(matches, key = self.__cmp_candidates))
    #--------------------------------------------------------
    def __cmp_candidates(self, candidate):
        """naive ordering: all candidates sort equal, thereby keeping their order"""
        return 0
373 # FIXME: do ordering 374 # if candidate1 < candidate2: 375 # return -1 376 # if candidate1 == candidate2: 377 # return 0 378 # return 1 379 380 # ===========================================================
class cMatchProvider_SQL2(cMatchProvider):
    """Match provider which searches matches
    in possibly several database tables.

    queries:
        - a list of unicode strings
        - each string is a query
        - each string must contain: "... WHERE <column> %(fragment_condition)s ..."
        - each string can contain in the where clause: "... %(<ctxt_key1>)s ..."
        - each query must return (data, list_label, field_label)

    context definitions to be used in the queries, example:
        {'ctxt_key1': {'where_part': 'AND country = %(country)s', 'placeholder': 'country'}}

    client code using .set_context() must use the 'placeholder':
        <phrasewheel>/<match provider>.set_context('country', 'Germany')

    full example query:

        query = u" " "
            SELECT DISTINCT ON (list_label)
                pk_encounter
                    AS data,
                to_char(started, 'YYYY Mon DD (HH24:MI)') || ': ' || l10n_type || ' [#' || pk_encounter || ']'
                    AS list_label,
                to_char(started, 'YYYY Mon DD') || ': ' || l10n_type
                    AS field_label
            FROM
                clin.v_pat_encounters
            WHERE
                (
                    l10n_type %(fragment_condition)s
                        OR
                    type %(fragment_condition)s
                )    %(ctxt_patient)s
            ORDER BY
                list_label
            LIMIT
                30
        " " "
        context = {'ctxt_patient': {
            'where_part': u'AND pk_patient = %(PLACEHOLDER)s',
            'placeholder': u'PLACEHOLDER'
        }}
        self.mp = gmMatchProvider.cMatchProvider_SQL2(queries = query, context = context)
        self.set_context(context = 'PLACEHOLDER', val = '<THE VALUE>')

    _SQL_data2match:
        SQL to retrieve a match by, say, primary key
        wherein the only keyword argument is 'pk'
    """
    def __init__(self, queries = None, context = None):
        """Set up the provider.

        queries -- one query string or a list/tuple of query strings
        context -- dict of context definitions (see class docstring), or None
        """
        cMatchProvider.__init__(self)

        if isinstance(queries, (list, tuple)):
            # work on a copy: failing queries get dropped from this
            # list which must not modify the caller's sequence
            self._queries = list(queries)
        else:
            self._queries = [queries]

        if context is None:
            self._context = {}
        else:
            self._context = context

        # arguments passed along with the queries
        self._args = {}

        self._SQL_data2match = None

    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #   - aFragment will not be "None"
    #   - aFragment will be lower case
    #   - we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""

        fragment_condition = "ILIKE %(fragment)s"
        self._args['fragment'] = "%s%%" % aFragment

        return self._find_matches(fragment_condition)

    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""

        fragment_condition = "~* %(fragment)s"
        aFragment = gmPG2.sanitize_pg_regex(expression = aFragment, escape_all = False)
        # either after a space or at the very start of the value
        self._args['fragment'] = "( %s)|(^%s)" % (aFragment, aFragment)

        return self._find_matches(fragment_condition)

    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""

        fragment_condition = "ILIKE %(fragment)s"
        self._args['fragment'] = "%%%s%%" % aFragment

        return self._find_matches(fragment_condition)

    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all items."""
        return self.getMatchesBySubstr('')

    #--------------------------------------------------------
    def get_match_by_data(self, data=None):
        """Return the single row found by running _SQL_data2match with <data> as 'pk'.

        Returns None if _SQL_data2match is not set, if running it
        fails (in which case it is dropped), or if it does not
        return exactly one row.
        """
        if self._SQL_data2match is None:
            return None

        query = {'cmd': self._SQL_data2match, 'args': {'pk': data}}
        try:
            rows, idx = gmPG2.run_ro_queries(queries = [query], get_col_idx = False)
        except Exception:
            # deliberately broad but no longer swallowing KeyboardInterrupt/SystemExit
            _log.exception('[%s]: error running _SQL_data2match, dropping query', self.__class__.__name__)
            self._SQL_data2match = None
            return None

        # hopefully the most frequent case:
        if len(rows) == 1:
            return rows[0]

        _log.error('[%s]: 0 or >1 rows found by running _SQL_data2match, ambiguous, ignoring', self.__class__.__name__)
        return None

    #--------------------------------------------------------
    def _rows2matches(self, rows):
        """Turns retrieved database values into a list
        of dicts fit for phrasewheel use.

        This method can be overridden to massage arbitrary
        data into the proper list of dicts.
        """
        matches = []
        for row in rows:
            # PRW wants a weight
            match = {'weight': 0}
            try:
                match['data'] = row['data']
            except KeyError:
                match['data'] = row[0]
            try:
                match['list_label'] = row['list_label']
            except KeyError:
                match['list_label'] = row[1]
            # explicit "field_label" in result ?
            try:
                match['field_label'] = row['field_label']
            # no
            except KeyError:
                # but does row[2] exist ?
                try:
                    match['field_label'] = row[2]
                # no: reuse "list_label"
                except IndexError:
                    match['field_label'] = match['list_label']
            matches.append(match)

        return matches

    #--------------------------------------------------------
    def _find_matches(self, fragment_condition):
        """Loads matching data from PostgreSQL and turns them into
        matches fit for consumption by a phrasewheel.

        The queries are tried in order, the rows of the first one
        returning any are used. A query failing with a database
        error is dropped from the list of queries and the search
        ends without matches.

        Returns (True, matches) or (False, []).
        """
        if self.print_queries:
            print("----------------------")
            print(pydt.datetime.now())

        for query in self._queries:
            where_fragments = {'fragment_condition': fragment_condition}

            for context_key, context_def in self._context.items():
                try:
                    placeholder = context_def['placeholder']
                    where_part = context_def['where_part']
                    self._args[placeholder] = self._context_vals[placeholder]
                    # we do have a context value for this key, so add the where condition
                    where_fragments[context_key] = where_part
                    if self.print_queries:
                        print("ctxt ph:", placeholder)
                        print("ctxt where:", where_part)
                        print("ctxt val:", self._context_vals[placeholder])
                except KeyError:
                    # we don't have a context value for this key, so skip the where condition
                    where_fragments[context_key] = ''
                    if self.print_queries:
                        print("invalid ctxt key:", context_key)

            cmd = query % where_fragments

            if self.print_queries:
                print("class:", self.__class__.__name__)
                print("ctxt:", self._context_vals)
                print("args:", self._args)
                print("query:", cmd)

            try:
                rows, idx = gmPG2.run_ro_queries(queries = [{'cmd': cmd, 'args': self._args}], get_col_idx = False)
            except gmPG2.PG_ERROR_EXCEPTION:
                _log.exception('[%s]: error running match provider SQL, dropping query', self.__class__.__name__)
                # the list being iterated over is modified
                # here so iteration must not be continued
                self._queries.remove(query)
                break
            # no matches found: try next query
            if len(rows) == 0:
                continue
            return (True, self._rows2matches(rows))
        # none found whatsoever
        return (False, [])
597 598 #================================================================ 599 if __name__ == '__main__': 600 pass 601