util.py 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845
  1. # Natural Language Toolkit: Utility functions
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Steven Bird <stevenbird1@gmail.com>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. import sys
  8. import inspect
  9. import locale
  10. import re
  11. import types
  12. import textwrap
  13. import pydoc
  14. import bisect
  15. import os
  16. from itertools import islice, chain, combinations, tee
  17. from pprint import pprint
  18. from collections import defaultdict, deque
  19. from sys import version_info
  20. from urllib.request import (
  21. build_opener,
  22. install_opener,
  23. getproxies,
  24. ProxyHandler,
  25. ProxyBasicAuthHandler,
  26. ProxyDigestAuthHandler,
  27. HTTPPasswordMgrWithDefaultRealm,
  28. )
  29. from nltk.internals import slice_bounds, raise_unorderable_types
  30. from nltk.collections import *
  31. ######################################################################
  32. # Short usage message
  33. ######################################################################
  34. def usage(obj, selfname="self"):
  35. str(obj) # In case it's lazy, this will load it.
  36. if not isinstance(obj, type):
  37. obj = obj.__class__
  38. print("%s supports the following operations:" % obj.__name__)
  39. for (name, method) in sorted(pydoc.allmethods(obj).items()):
  40. if name.startswith("_"):
  41. continue
  42. if getattr(method, "__deprecated__", False):
  43. continue
  44. getargspec = inspect.getfullargspec
  45. args, varargs, varkw, defaults = getargspec(method)[:4]
  46. if (
  47. args
  48. and args[0] == "self"
  49. and (defaults is None or len(args) > len(defaults))
  50. ):
  51. args = args[1:]
  52. name = "%s.%s" % (selfname, name)
  53. argspec = inspect.formatargspec(args, varargs, varkw, defaults)
  54. print(
  55. textwrap.fill(
  56. "%s%s" % (name, argspec),
  57. initial_indent=" - ",
  58. subsequent_indent=" " * (len(name) + 5),
  59. )
  60. )
  61. ##########################################################################
  62. # IDLE
  63. ##########################################################################
  64. def in_idle():
  65. """
  66. Return True if this function is run within idle. Tkinter
  67. programs that are run in idle should never call ``Tk.mainloop``; so
  68. this function should be used to gate all calls to ``Tk.mainloop``.
  69. :warning: This function works by checking ``sys.stdin``. If the
  70. user has modified ``sys.stdin``, then it may return incorrect
  71. results.
  72. :rtype: bool
  73. """
  74. import sys
  75. return sys.stdin.__class__.__name__ in ("PyShell", "RPCProxy")
  76. ##########################################################################
  77. # PRETTY PRINTING
  78. ##########################################################################
  79. def pr(data, start=0, end=None):
  80. """
  81. Pretty print a sequence of data items
  82. :param data: the data stream to print
  83. :type data: sequence or iter
  84. :param start: the start position
  85. :type start: int
  86. :param end: the end position
  87. :type end: int
  88. """
  89. pprint(list(islice(data, start, end)))
  90. def print_string(s, width=70):
  91. """
  92. Pretty print a string, breaking lines on whitespace
  93. :param s: the string to print, consisting of words and spaces
  94. :type s: str
  95. :param width: the display width
  96. :type width: int
  97. """
  98. print("\n".join(textwrap.wrap(s, width=width)))
  99. def tokenwrap(tokens, separator=" ", width=70):
  100. """
  101. Pretty print a list of text tokens, breaking lines on whitespace
  102. :param tokens: the tokens to print
  103. :type tokens: list
  104. :param separator: the string to use to separate tokens
  105. :type separator: str
  106. :param width: the display width (default=70)
  107. :type width: int
  108. """
  109. return "\n".join(textwrap.wrap(separator.join(tokens), width=width))
  110. ##########################################################################
  111. # Python version
  112. ##########################################################################
  113. def py25():
  114. return version_info[0] == 2 and version_info[1] == 5
  115. def py26():
  116. return version_info[0] == 2 and version_info[1] == 6
  117. def py27():
  118. return version_info[0] == 2 and version_info[1] == 7
  119. ##########################################################################
  120. # Indexing
  121. ##########################################################################
  122. class Index(defaultdict):
  123. def __init__(self, pairs):
  124. defaultdict.__init__(self, list)
  125. for key, value in pairs:
  126. self[key].append(value)
  127. ######################################################################
  128. ## Regexp display (thanks to David Mertz)
  129. ######################################################################
  130. def re_show(regexp, string, left="{", right="}"):
  131. """
  132. Return a string with markers surrounding the matched substrings.
  133. Search str for substrings matching ``regexp`` and wrap the matches
  134. with braces. This is convenient for learning about regular expressions.
  135. :param regexp: The regular expression.
  136. :type regexp: str
  137. :param string: The string being matched.
  138. :type string: str
  139. :param left: The left delimiter (printed before the matched substring)
  140. :type left: str
  141. :param right: The right delimiter (printed after the matched substring)
  142. :type right: str
  143. :rtype: str
  144. """
  145. print(re.compile(regexp, re.M).sub(left + r"\g<0>" + right, string.rstrip()))
  146. ##########################################################################
  147. # READ FROM FILE OR STRING
  148. ##########################################################################
  149. # recipe from David Mertz
  150. def filestring(f):
  151. if hasattr(f, "read"):
  152. return f.read()
  153. elif isinstance(f, str):
  154. with open(f, "r") as infile:
  155. return infile.read()
  156. else:
  157. raise ValueError("Must be called with a filename or file-like object")
  158. ##########################################################################
  159. # Breadth-First Search
  160. ##########################################################################
  161. def breadth_first(tree, children=iter, maxdepth=-1):
  162. """Traverse the nodes of a tree in breadth-first order.
  163. (No need to check for cycles.)
  164. The first argument should be the tree root;
  165. children should be a function taking as argument a tree node
  166. and returning an iterator of the node's children.
  167. """
  168. queue = deque([(tree, 0)])
  169. while queue:
  170. node, depth = queue.popleft()
  171. yield node
  172. if depth != maxdepth:
  173. try:
  174. queue.extend((c, depth + 1) for c in children(node))
  175. except TypeError:
  176. pass
  177. ##########################################################################
  178. # Guess Character Encoding
  179. ##########################################################################
  180. # adapted from io.py in the docutils extension module (http://docutils.sourceforge.net)
  181. # http://www.pyzine.com/Issue008/Section_Articles/article_Encodings.html
  182. def guess_encoding(data):
  183. """
  184. Given a byte string, attempt to decode it.
  185. Tries the standard 'UTF8' and 'latin-1' encodings,
  186. Plus several gathered from locale information.
  187. The calling program *must* first call::
  188. locale.setlocale(locale.LC_ALL, '')
  189. If successful it returns ``(decoded_unicode, successful_encoding)``.
  190. If unsuccessful it raises a ``UnicodeError``.
  191. """
  192. successful_encoding = None
  193. # we make 'utf-8' the first encoding
  194. encodings = ["utf-8"]
  195. #
  196. # next we add anything we can learn from the locale
  197. try:
  198. encodings.append(locale.nl_langinfo(locale.CODESET))
  199. except AttributeError:
  200. pass
  201. try:
  202. encodings.append(locale.getlocale()[1])
  203. except (AttributeError, IndexError):
  204. pass
  205. try:
  206. encodings.append(locale.getdefaultlocale()[1])
  207. except (AttributeError, IndexError):
  208. pass
  209. #
  210. # we try 'latin-1' last
  211. encodings.append("latin-1")
  212. for enc in encodings:
  213. # some of the locale calls
  214. # may have returned None
  215. if not enc:
  216. continue
  217. try:
  218. decoded = str(data, enc)
  219. successful_encoding = enc
  220. except (UnicodeError, LookupError):
  221. pass
  222. else:
  223. break
  224. if not successful_encoding:
  225. raise UnicodeError(
  226. "Unable to decode input data. "
  227. "Tried the following encodings: %s."
  228. % ", ".join([repr(enc) for enc in encodings if enc])
  229. )
  230. else:
  231. return (decoded, successful_encoding)
  232. ##########################################################################
  233. # Remove repeated elements from a list deterministically
  234. ##########################################################################
  235. def unique_list(xs):
  236. seen = set()
  237. # not seen.add(x) here acts to make the code shorter without using if statements, seen.add(x) always returns None.
  238. return [x for x in xs if x not in seen and not seen.add(x)]
  239. ##########################################################################
  240. # Invert a dictionary
  241. ##########################################################################
  242. def invert_dict(d):
  243. inverted_dict = defaultdict(list)
  244. for key in d:
  245. if hasattr(d[key], "__iter__"):
  246. for term in d[key]:
  247. inverted_dict[term].append(key)
  248. else:
  249. inverted_dict[d[key]] = key
  250. return inverted_dict
  251. ##########################################################################
  252. # Utilities for directed graphs: transitive closure, and inversion
  253. # The graph is represented as a dictionary of sets
  254. ##########################################################################
  255. def transitive_closure(graph, reflexive=False):
  256. """
  257. Calculate the transitive closure of a directed graph,
  258. optionally the reflexive transitive closure.
  259. The algorithm is a slight modification of the "Marking Algorithm" of
  260. Ioannidis & Ramakrishnan (1998) "Efficient Transitive Closure Algorithms".
  261. :param graph: the initial graph, represented as a dictionary of sets
  262. :type graph: dict(set)
  263. :param reflexive: if set, also make the closure reflexive
  264. :type reflexive: bool
  265. :rtype: dict(set)
  266. """
  267. if reflexive:
  268. base_set = lambda k: set([k])
  269. else:
  270. base_set = lambda k: set()
  271. # The graph U_i in the article:
  272. agenda_graph = dict((k, graph[k].copy()) for k in graph)
  273. # The graph M_i in the article:
  274. closure_graph = dict((k, base_set(k)) for k in graph)
  275. for i in graph:
  276. agenda = agenda_graph[i]
  277. closure = closure_graph[i]
  278. while agenda:
  279. j = agenda.pop()
  280. closure.add(j)
  281. closure |= closure_graph.setdefault(j, base_set(j))
  282. agenda |= agenda_graph.get(j, base_set(j))
  283. agenda -= closure
  284. return closure_graph
  285. def invert_graph(graph):
  286. """
  287. Inverts a directed graph.
  288. :param graph: the graph, represented as a dictionary of sets
  289. :type graph: dict(set)
  290. :return: the inverted graph
  291. :rtype: dict(set)
  292. """
  293. inverted = {}
  294. for key in graph:
  295. for value in graph[key]:
  296. inverted.setdefault(value, set()).add(key)
  297. return inverted
  298. ##########################################################################
  299. # HTML Cleaning
  300. ##########################################################################
  301. def clean_html(html):
  302. raise NotImplementedError(
  303. "To remove HTML markup, use BeautifulSoup's get_text() function"
  304. )
  305. def clean_url(url):
  306. raise NotImplementedError(
  307. "To remove HTML markup, use BeautifulSoup's get_text() function"
  308. )
  309. ##########################################################################
  310. # FLATTEN LISTS
  311. ##########################################################################
  312. def flatten(*args):
  313. """
  314. Flatten a list.
  315. >>> from nltk.util import flatten
  316. >>> flatten(1, 2, ['b', 'a' , ['c', 'd']], 3)
  317. [1, 2, 'b', 'a', 'c', 'd', 3]
  318. :param args: items and lists to be combined into a single list
  319. :rtype: list
  320. """
  321. x = []
  322. for l in args:
  323. if not isinstance(l, (list, tuple)):
  324. l = [l]
  325. for item in l:
  326. if isinstance(item, (list, tuple)):
  327. x.extend(flatten(item))
  328. else:
  329. x.append(item)
  330. return x
  331. ##########################################################################
  332. # Ngram iteration
  333. ##########################################################################
  334. def pad_sequence(
  335. sequence,
  336. n,
  337. pad_left=False,
  338. pad_right=False,
  339. left_pad_symbol=None,
  340. right_pad_symbol=None,
  341. ):
  342. """
  343. Returns a padded sequence of items before ngram extraction.
  344. >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
  345. ['<s>', 1, 2, 3, 4, 5, '</s>']
  346. >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
  347. ['<s>', 1, 2, 3, 4, 5]
  348. >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
  349. [1, 2, 3, 4, 5, '</s>']
  350. :param sequence: the source data to be padded
  351. :type sequence: sequence or iter
  352. :param n: the degree of the ngrams
  353. :type n: int
  354. :param pad_left: whether the ngrams should be left-padded
  355. :type pad_left: bool
  356. :param pad_right: whether the ngrams should be right-padded
  357. :type pad_right: bool
  358. :param left_pad_symbol: the symbol to use for left padding (default is None)
  359. :type left_pad_symbol: any
  360. :param right_pad_symbol: the symbol to use for right padding (default is None)
  361. :type right_pad_symbol: any
  362. :rtype: sequence or iter
  363. """
  364. sequence = iter(sequence)
  365. if pad_left:
  366. sequence = chain((left_pad_symbol,) * (n - 1), sequence)
  367. if pad_right:
  368. sequence = chain(sequence, (right_pad_symbol,) * (n - 1))
  369. return sequence
  370. # add a flag to pad the sequence so we get peripheral ngrams?
  371. def ngrams(
  372. sequence,
  373. n,
  374. pad_left=False,
  375. pad_right=False,
  376. left_pad_symbol=None,
  377. right_pad_symbol=None,
  378. ):
  379. """
  380. Return the ngrams generated from a sequence of items, as an iterator.
  381. For example:
  382. >>> from nltk.util import ngrams
  383. >>> list(ngrams([1,2,3,4,5], 3))
  384. [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
  385. Wrap with list for a list version of this function. Set pad_left
  386. or pad_right to true in order to get additional ngrams:
  387. >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
  388. [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
  389. >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
  390. [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
  391. >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
  392. [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
  393. >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
  394. [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
  395. :param sequence: the source data to be converted into ngrams
  396. :type sequence: sequence or iter
  397. :param n: the degree of the ngrams
  398. :type n: int
  399. :param pad_left: whether the ngrams should be left-padded
  400. :type pad_left: bool
  401. :param pad_right: whether the ngrams should be right-padded
  402. :type pad_right: bool
  403. :param left_pad_symbol: the symbol to use for left padding (default is None)
  404. :type left_pad_symbol: any
  405. :param right_pad_symbol: the symbol to use for right padding (default is None)
  406. :type right_pad_symbol: any
  407. :rtype: sequence or iter
  408. """
  409. sequence = pad_sequence(
  410. sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol
  411. )
  412. history = []
  413. while n > 1:
  414. # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
  415. try:
  416. next_item = next(sequence)
  417. except StopIteration:
  418. # no more data, terminate the generator
  419. return
  420. history.append(next_item)
  421. n -= 1
  422. for item in sequence:
  423. history.append(item)
  424. yield tuple(history)
  425. del history[0]
  426. def bigrams(sequence, **kwargs):
  427. """
  428. Return the bigrams generated from a sequence of items, as an iterator.
  429. For example:
  430. >>> from nltk.util import bigrams
  431. >>> list(bigrams([1,2,3,4,5]))
  432. [(1, 2), (2, 3), (3, 4), (4, 5)]
  433. Use bigrams for a list version of this function.
  434. :param sequence: the source data to be converted into bigrams
  435. :type sequence: sequence or iter
  436. :rtype: iter(tuple)
  437. """
  438. for item in ngrams(sequence, 2, **kwargs):
  439. yield item
  440. def trigrams(sequence, **kwargs):
  441. """
  442. Return the trigrams generated from a sequence of items, as an iterator.
  443. For example:
  444. >>> from nltk.util import trigrams
  445. >>> list(trigrams([1,2,3,4,5]))
  446. [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
  447. Use trigrams for a list version of this function.
  448. :param sequence: the source data to be converted into trigrams
  449. :type sequence: sequence or iter
  450. :rtype: iter(tuple)
  451. """
  452. for item in ngrams(sequence, 3, **kwargs):
  453. yield item
  454. def everygrams(sequence, min_len=1, max_len=-1, **kwargs):
  455. """
  456. Returns all possible ngrams generated from a sequence of items, as an iterator.
  457. >>> sent = 'a b c'.split()
  458. >>> list(everygrams(sent))
  459. [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c'), ('a', 'b', 'c')]
  460. >>> list(everygrams(sent, max_len=2))
  461. [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c')]
  462. :param sequence: the source data to be converted into trigrams
  463. :type sequence: sequence or iter
  464. :param min_len: minimum length of the ngrams, aka. n-gram order/degree of ngram
  465. :type min_len: int
  466. :param max_len: maximum length of the ngrams (set to length of sequence by default)
  467. :type max_len: int
  468. :rtype: iter(tuple)
  469. """
  470. if max_len == -1:
  471. max_len = len(sequence)
  472. for n in range(min_len, max_len + 1):
  473. for ng in ngrams(sequence, n, **kwargs):
  474. yield ng
  475. def skipgrams(sequence, n, k, **kwargs):
  476. """
  477. Returns all possible skipgrams generated from a sequence of items, as an iterator.
  478. Skipgrams are ngrams that allows tokens to be skipped.
  479. Refer to http://homepages.inf.ed.ac.uk/ballison/pdf/lrec_skipgrams.pdf
  480. >>> sent = "Insurgents killed in ongoing fighting".split()
  481. >>> list(skipgrams(sent, 2, 2))
  482. [('Insurgents', 'killed'), ('Insurgents', 'in'), ('Insurgents', 'ongoing'), ('killed', 'in'), ('killed', 'ongoing'), ('killed', 'fighting'), ('in', 'ongoing'), ('in', 'fighting'), ('ongoing', 'fighting')]
  483. >>> list(skipgrams(sent, 3, 2))
  484. [('Insurgents', 'killed', 'in'), ('Insurgents', 'killed', 'ongoing'), ('Insurgents', 'killed', 'fighting'), ('Insurgents', 'in', 'ongoing'), ('Insurgents', 'in', 'fighting'), ('Insurgents', 'ongoing', 'fighting'), ('killed', 'in', 'ongoing'), ('killed', 'in', 'fighting'), ('killed', 'ongoing', 'fighting'), ('in', 'ongoing', 'fighting')]
  485. :param sequence: the source data to be converted into trigrams
  486. :type sequence: sequence or iter
  487. :param n: the degree of the ngrams
  488. :type n: int
  489. :param k: the skip distance
  490. :type k: int
  491. :rtype: iter(tuple)
  492. """
  493. # Pads the sequence as desired by **kwargs.
  494. if "pad_left" in kwargs or "pad_right" in kwargs:
  495. sequence = pad_sequence(sequence, n, **kwargs)
  496. # Note when iterating through the ngrams, the pad_right here is not
  497. # the **kwargs padding, it's for the algorithm to detect the SENTINEL
  498. # object on the right pad to stop inner loop.
  499. SENTINEL = object()
  500. for ngram in ngrams(sequence, n + k, pad_right=True, right_pad_symbol=SENTINEL):
  501. head = ngram[:1]
  502. tail = ngram[1:]
  503. for skip_tail in combinations(tail, n - 1):
  504. if skip_tail[-1] is SENTINEL:
  505. continue
  506. yield head + skip_tail
  507. ######################################################################
  508. # Binary Search in a File
  509. ######################################################################
  510. # inherited from pywordnet, by Oliver Steele
  511. def binary_search_file(file, key, cache={}, cacheDepth=-1):
  512. """
  513. Return the line from the file with first word key.
  514. Searches through a sorted file using the binary search algorithm.
  515. :type file: file
  516. :param file: the file to be searched through.
  517. :type key: str
  518. :param key: the identifier we are searching for.
  519. """
  520. key = key + " "
  521. keylen = len(key)
  522. start = 0
  523. currentDepth = 0
  524. if hasattr(file, "name"):
  525. end = os.stat(file.name).st_size - 1
  526. else:
  527. file.seek(0, 2)
  528. end = file.tell() - 1
  529. file.seek(0)
  530. while start < end:
  531. lastState = start, end
  532. middle = (start + end) // 2
  533. if cache.get(middle):
  534. offset, line = cache[middle]
  535. else:
  536. line = ""
  537. while True:
  538. file.seek(max(0, middle - 1))
  539. if middle > 0:
  540. file.discard_line()
  541. offset = file.tell()
  542. line = file.readline()
  543. if line != "":
  544. break
  545. # at EOF; try to find start of the last line
  546. middle = (start + middle) // 2
  547. if middle == end - 1:
  548. return None
  549. if currentDepth < cacheDepth:
  550. cache[middle] = (offset, line)
  551. if offset > end:
  552. assert end != middle - 1, "infinite loop"
  553. end = middle - 1
  554. elif line[:keylen] == key:
  555. return line
  556. elif line > key:
  557. assert end != middle - 1, "infinite loop"
  558. end = middle - 1
  559. elif line < key:
  560. start = offset + len(line) - 1
  561. currentDepth += 1
  562. thisState = start, end
  563. if lastState == thisState:
  564. # Detects the condition where we're searching past the end
  565. # of the file, which is otherwise difficult to detect
  566. return None
  567. return None
  568. ######################################################################
  569. # Proxy configuration
  570. ######################################################################
  571. def set_proxy(proxy, user=None, password=""):
  572. """
  573. Set the HTTP proxy for Python to download through.
  574. If ``proxy`` is None then tries to set proxy from environment or system
  575. settings.
  576. :param proxy: The HTTP proxy server to use. For example:
  577. 'http://proxy.example.com:3128/'
  578. :param user: The username to authenticate with. Use None to disable
  579. authentication.
  580. :param password: The password to authenticate with.
  581. """
  582. if proxy is None:
  583. # Try and find the system proxy settings
  584. try:
  585. proxy = getproxies()["http"]
  586. except KeyError:
  587. raise ValueError("Could not detect default proxy settings")
  588. # Set up the proxy handler
  589. proxy_handler = ProxyHandler({"https": proxy, "http": proxy})
  590. opener = build_opener(proxy_handler)
  591. if user is not None:
  592. # Set up basic proxy authentication if provided
  593. password_manager = HTTPPasswordMgrWithDefaultRealm()
  594. password_manager.add_password(realm=None, uri=proxy, user=user, passwd=password)
  595. opener.add_handler(ProxyBasicAuthHandler(password_manager))
  596. opener.add_handler(ProxyDigestAuthHandler(password_manager))
  597. # Overide the existing url opener
  598. install_opener(opener)
  599. ######################################################################
  600. # ElementTree pretty printing from http://www.effbot.org/zone/element-lib.htm
  601. ######################################################################
  602. def elementtree_indent(elem, level=0):
  603. """
  604. Recursive function to indent an ElementTree._ElementInterface
  605. used for pretty printing. Run indent on elem and then output
  606. in the normal way.
  607. :param elem: element to be indented. will be modified.
  608. :type elem: ElementTree._ElementInterface
  609. :param level: level of indentation for this element
  610. :type level: nonnegative integer
  611. :rtype: ElementTree._ElementInterface
  612. :return: Contents of elem indented to reflect its structure
  613. """
  614. i = "\n" + level * " "
  615. if len(elem):
  616. if not elem.text or not elem.text.strip():
  617. elem.text = i + " "
  618. for elem in elem:
  619. elementtree_indent(elem, level + 1)
  620. if not elem.tail or not elem.tail.strip():
  621. elem.tail = i
  622. else:
  623. if level and (not elem.tail or not elem.tail.strip()):
  624. elem.tail = i
  625. ######################################################################
  626. # Mathematical approximations
  627. ######################################################################
  628. def choose(n, k):
  629. """
  630. This function is a fast way to calculate binomial coefficients, commonly
  631. known as nCk, i.e. the number of combinations of n things taken k at a time.
  632. (https://en.wikipedia.org/wiki/Binomial_coefficient).
  633. This is the *scipy.special.comb()* with long integer computation but this
  634. approximation is faster, see https://github.com/nltk/nltk/issues/1181
  635. >>> choose(4, 2)
  636. 6
  637. >>> choose(6, 2)
  638. 15
  639. :param n: The number of things.
  640. :type n: int
  641. :param r: The number of times a thing is taken.
  642. :type r: int
  643. """
  644. if 0 <= k <= n:
  645. ntok, ktok = 1, 1
  646. for t in range(1, min(k, n - k) + 1):
  647. ntok *= n
  648. ktok *= t
  649. n -= 1
  650. return ntok // ktok
  651. else:
  652. return 0
  653. ######################################################################
  654. # Iteration utilities
  655. ######################################################################
  656. def pairwise(iterable):
  657. """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
  658. a, b = tee(iterable)
  659. next(b, None)
  660. return zip(a, b)
  661. ######################################################################
  662. # Parallelization.
  663. ######################################################################
  664. def parallelize_preprocess(func, iterator, processes, progress_bar=False):
  665. from tqdm import tqdm
  666. from joblib import Parallel, delayed
  667. iterator = tqdm(iterator) if progress_bar else iterator
  668. if processes <= 1:
  669. return map(func, iterator)
  670. return Parallel(n_jobs=processes)(delayed(func)(line) for line in iterator)