| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762
2772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548 |
- # Natural Language Toolkit: Corpus & Model Downloader
- #
- # Copyright (C) 2001-2020 NLTK Project
- # Author: Edward Loper <edloper@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- The NLTK corpus and module downloader. This module defines several
- interfaces which can be used to download corpora, models, and other
- data packages that can be used with NLTK.
- Downloading Packages
- ====================
- If called with no arguments, ``download()`` will display an interactive
- interface which can be used to download and install new packages.
- If Tkinter is available, then a graphical interface will be shown,
- otherwise a simple text interface will be provided.
- Individual packages can be downloaded by calling the ``download()``
- function with a single argument, giving the package identifier for the
- package that should be downloaded:
- >>> download('treebank') # doctest: +SKIP
- [nltk_data] Downloading package 'treebank'...
- [nltk_data] Unzipping corpora/treebank.zip.
- NLTK also provides a number of \"package collections\", consisting of
- a group of related packages. To download all packages in a
- collection, simply call ``download()`` with the collection's
- identifier:
- >>> download('all-corpora') # doctest: +SKIP
- [nltk_data] Downloading package 'abc'...
- [nltk_data] Unzipping corpora/abc.zip.
- [nltk_data] Downloading package 'alpino'...
- [nltk_data] Unzipping corpora/alpino.zip.
- ...
- [nltk_data] Downloading package 'words'...
- [nltk_data] Unzipping corpora/words.zip.
- Download Directory
- ==================
- By default, packages are installed in either a system-wide directory
- (if Python has sufficient access to write to it); or in the current
- user's home directory. However, the ``download_dir`` argument may be
- used to specify a different installation target, if desired.
- See ``Downloader.default_download_dir()`` for a more detailed
- description of how the default download directory is chosen.
- NLTK Download Server
- ====================
- Before downloading any packages, the corpus and module downloader
- contacts the NLTK download server, to retrieve an index file
- describing the available packages. By default, this index file is
- loaded from ``https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml``.
- If necessary, it is possible to create a new ``Downloader`` object,
- specifying a different URL for the package index file.
- Usage::
- python nltk/downloader.py [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS
- or::
- python -m nltk.downloader [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS
- """
- # ----------------------------------------------------------------------
- """
- 0 1 2 3
- [label][----][label][----]
- [column ][column ]
- Notes
- =====
- Handling data files.. Some questions:
- * Should the data files be kept zipped or unzipped? I say zipped.
- * Should the data files be kept in svn at all? Advantages: history;
- automatic version numbers; 'svn up' could be used rather than the
- downloader to update the corpora. Disadvantages: they're big,
- which makes working from svn a bit of a pain. And we're planning
- to potentially make them much bigger. I don't think we want
- people to have to download 400MB corpora just to use nltk from svn.
- * Compromise: keep the data files in trunk/data rather than in
- trunk/nltk. That way you can check them out in svn if you want
- to; but you don't need to, and you can use the downloader instead.
- * Also: keep models in mind. When we change the code, we'd
- potentially like the models to get updated. This could require a
- little thought.
- * So.. let's assume we have a trunk/data directory, containing a bunch
- of packages. The packages should be kept as zip files, because we
- really shouldn't be editing them much (well -- we may edit models
- more, but they tend to be binary-ish files anyway, where diffs
- aren't that helpful). So we'll have trunk/data, with a bunch of
- files like abc.zip and treebank.zip and propbank.zip. For each
- package we could also have eg treebank.xml and propbank.xml,
- describing the contents of the package (name, copyright, license,
- etc). Collections would also have .xml files. Finally, we would
- pull all these together to form a single index.xml file. Some
- directory structure wouldn't hurt. So how about::
- /trunk/data/ ....................... root of data svn
- index.xml ........................ main index file
- src/ ............................. python scripts
- packages/ ........................ dir for packages
- corpora/ ....................... zip & xml files for corpora
- grammars/ ...................... zip & xml files for grammars
- taggers/ ....................... zip & xml files for taggers
- tokenizers/ .................... zip & xml files for tokenizers
- etc.
- collections/ ..................... xml files for collections
- Where the root (/trunk/data) would contain a makefile; and src/
- would contain a script to update the info.xml file. It could also
- contain scripts to rebuild some of the various model files. The
- script that builds index.xml should probably check that each zip
- file expands entirely into a single subdir, whose name matches the
- package's uid.
- Changes I need to make:
- - in index: change "size" to "filesize" or "compressed-size"
- - in index: add "unzipped-size"
- - when checking status: check both compressed & uncompressed size.
- uncompressed size is important to make sure we detect a problem
- if something got partially unzipped. define new status values
- to differentiate stale vs corrupt vs corruptly-uncompressed??
- (we shouldn't need to re-download the file if the zip file is ok
- but it didn't get uncompressed fully.)
- - add other fields to the index: author, license, copyright, contact,
- etc.
- the current grammars/ package would become a single new package (eg
- toy-grammars or book-grammars).
- xml file should have:
- - authorship info
- - license info
- - copyright info
- - contact info
- - info about what type of data/annotation it contains?
- - recommended corpus reader?
- collections can contain other collections. they can also contain
- multiple package types (corpora & models). Have a single 'basics'
- package that includes everything we talk about in the book?
- n.b.: there will have to be a fallback to the punkt tokenizer, in case
- they didn't download that model.
- default: unzip or not?
- """
- import time, os, zipfile, sys, textwrap, threading, itertools, shutil, functools
- import subprocess
- from hashlib import md5
- from xml.etree import ElementTree
- try:
- TKINTER = True
- from tkinter import (
- Tk,
- Frame,
- Label,
- Entry,
- Button,
- Canvas,
- Menu,
- IntVar,
- TclError,
- )
- from tkinter.messagebox import showerror
- from nltk.draw.table import Table
- from nltk.draw.util import ShowText
- except ImportError:
- TKINTER = False
- TclError = ValueError
- from urllib.request import urlopen
- from urllib.error import HTTPError, URLError
- import nltk
- # urllib2 = nltk.internals.import_from_stdlib('urllib2')
- ######################################################################
- # Directory entry objects (from the data server's index file)
- ######################################################################
class Package(object):
    """
    A directory entry for a downloadable package.  These entries are
    extracted from the XML index file that is downloaded by
    ``Downloader``.  Each package consists of a single file; but if
    that file is a zip file, then it can be automatically decompressed
    when the package is installed.
    """

    def __init__(
        self,
        id,
        url,
        name=None,
        subdir="",
        size=None,
        unzipped_size=None,
        checksum=None,
        svn_revision=None,
        copyright="Unknown",
        contact="Unknown",
        license="Unknown",
        author="Unknown",
        unzip=True,
        **kw
    ):
        """
        Build a package entry from the attributes of an index element.

        ``size`` and ``unzipped_size`` are converted with ``int()``, so
        despite their ``None`` defaults they must be supplied (as ints
        or numeric strings); omitting them raises ``TypeError``.
        ``unzip`` goes through ``int()`` and then ``bool()``, so it may
        be a bool, an int, or the strings ``'0'``/``'1'``.  Any extra
        keyword arguments are stored as instance attributes.
        """
        # A unique identifier for this package.
        self.id = id
        # A string name for this package (falls back to the identifier).
        self.name = name or id
        # The subdirectory where this package should be installed,
        # e.g. ``'corpora'`` or ``'taggers'``.
        self.subdir = subdir
        # A URL that can be used to download this package's file.
        self.url = url
        # The filesize (in bytes) of the package file.
        self.size = int(size)
        # The total filesize of the files contained in the package's
        # zipfile.
        self.unzipped_size = int(unzipped_size)
        # The MD-5 checksum of the package file.
        self.checksum = checksum
        # A subversion revision number for this package.
        self.svn_revision = svn_revision
        # Copyright holder for this package.
        self.copyright = copyright
        # Name & email of the person who should be contacted with
        # questions about this package.
        self.contact = contact
        # License information for this package.
        self.license = license
        # Author of this package.
        self.author = author
        # The filename that should be used for this package's file.  It
        # is formed by joining ``self.subdir`` with ``self.id``, and
        # using the same extension as the last path segment of ``url``.
        ext = os.path.splitext(url.split("/")[-1])[1]
        self.filename = os.path.join(subdir, id + ext)
        # A flag indicating whether this corpus should be unzipped by
        # default ('0' or '1' when it comes from the XML index).
        self.unzip = bool(int(unzip))
        # Include any other attributes provided by the XML file.
        self.__dict__.update(kw)

    @staticmethod
    def fromxml(xml):
        """
        Create a ``Package`` from an XML element or from an XML file.

        :param xml: Either an element exposing ``attrib``, or a string
            naming an XML file whose root element describes the package.
        :return: A new ``Package`` built from the element's attributes.
        """
        if isinstance(xml, str):
            # ``ElementTree.parse`` returns a tree object, which has no
            # ``attrib``; the attributes live on its root element.
            xml = ElementTree.parse(xml).getroot()
        # Copy into a fresh dict so the caller's element is left untouched.
        attributes = {key: str(value) for key, value in xml.attrib.items()}
        return Package(**attributes)

    def __lt__(self, other):
        """Order packages by identifier."""
        return self.id < other.id

    def __repr__(self):
        return "<Package %s>" % self.id
class Collection(object):
    """
    A directory entry for a collection of downloadable packages.
    These entries are extracted from the XML index file that is
    downloaded by ``Downloader``.
    """

    def __init__(self, id, children, name=None, **kw):
        """
        :param id: Unique identifier for this collection.
        :param children: The ``Collections`` or ``Packages`` (or their
            identifiers) directly contained by this collection.
        :param name: Display name; falls back to ``id``.
        :param kw: Extra attributes, stored directly on the instance.
        """
        # A unique identifier for this collection.
        self.id = id
        # A string name for this collection.
        self.name = name or id
        # A list of the ``Collections`` or ``Packages`` directly
        # contained by this collection.
        self.children = children
        # A list of ``Packages`` contained by this collection or any
        # collections it recursively contains.  Not computed here; it
        # starts out as ``None``.
        self.packages = None
        # Include any other attributes provided by the XML file.
        self.__dict__.update(kw)

    @staticmethod
    def fromxml(xml):
        """
        Create a ``Collection`` from an XML element or from an XML file.

        The ``ref`` attribute of every direct ``item`` child becomes an
        entry of ``children``.

        :param xml: Either an element, or a string naming an XML file
            whose root element describes the collection.
        """
        if isinstance(xml, str):
            # ``ElementTree.parse`` returns a tree object, which has no
            # ``attrib``; the attributes live on its root element.
            xml = ElementTree.parse(xml).getroot()
        # Copy into a fresh dict so the caller's element is left untouched.
        attributes = {key: str(value) for key, value in xml.attrib.items()}
        children = [child.get("ref") for child in xml.findall("item")]
        return Collection(children=children, **attributes)

    def __lt__(self, other):
        """Order collections by identifier."""
        return self.id < other.id

    def __repr__(self):
        return "<Collection %s>" % self.id
- ######################################################################
- # Message Passing Objects
- ######################################################################
class DownloaderMessage(object):
    """Base class for the status messages that ``incr_download``
    yields to report its progress."""
class StartCollectionMessage(DownloaderMessage):
    """Work on a collection of packages is starting."""

    def __init__(self, collection):
        # The collection whose children are about to be processed.
        self.collection = collection
class FinishCollectionMessage(DownloaderMessage):
    """Work on a collection of packages is complete."""

    def __init__(self, collection):
        # The collection whose children have all been processed.
        self.collection = collection
class StartPackageMessage(DownloaderMessage):
    """Work on a single package is starting."""

    def __init__(self, package):
        # The package being processed.
        self.package = package
class FinishPackageMessage(DownloaderMessage):
    """Work on a single package is complete."""

    def __init__(self, package):
        # The package that has been processed.
        self.package = package
class StartDownloadMessage(DownloaderMessage):
    """The download of a package's file is starting."""

    def __init__(self, package):
        # The package whose file is about to be fetched.
        self.package = package
class FinishDownloadMessage(DownloaderMessage):
    """The download of a package's file is complete."""

    def __init__(self, package):
        # The package whose file has been fetched.
        self.package = package
class StartUnzipMessage(DownloaderMessage):
    """Unzipping of a package is starting."""

    def __init__(self, package):
        # The package whose zip file is about to be expanded.
        self.package = package
class FinishUnzipMessage(DownloaderMessage):
    """Unzipping of a package is complete."""

    def __init__(self, package):
        # The package whose zip file has been expanded.
        self.package = package
class UpToDateMessage(DownloaderMessage):
    """The package's download file is already up-to-date, so nothing
    needs to be fetched."""

    def __init__(self, package):
        # The package that was found to be current.
        self.package = package
class StaleMessage(DownloaderMessage):
    """The package's download file is out-of-date or corrupt."""

    def __init__(self, package):
        # The package whose existing file is stale.
        self.package = package
class ErrorMessage(DownloaderMessage):
    """An error was encountered while processing a package."""

    def __init__(self, package, message):
        # The package (or ``None``) the error relates to.
        self.package = package
        # Exceptions are flattened to their string form; anything else
        # is stored unchanged.
        self.message = str(message) if isinstance(message, Exception) else message
class ProgressMessage(DownloaderMessage):
    """Reports how far along the current operation is."""

    def __init__(self, progress):
        # Numeric progress value supplied by the sender.
        self.progress = progress
class SelectDownloadDirMessage(DownloaderMessage):
    """Announces which download directory is being used."""

    def __init__(self, download_dir):
        # The directory packages will be written to.
        self.download_dir = download_dir
- ######################################################################
- # NLTK Data Server
- ######################################################################
- class Downloader(object):
- """
- A class used to access the NLTK data server, which can be used to
- download corpora and other data packages.
- """
- # /////////////////////////////////////////////////////////////////
- # Configuration
- # /////////////////////////////////////////////////////////////////
- INDEX_TIMEOUT = 60 * 60 # 1 hour
- """The amount of time after which the cached copy of the data
- server index will be considered 'stale,' and will be
- re-downloaded."""
- DEFAULT_URL = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml"
- """The default URL for the NLTK data server's index. An
- alternative URL can be specified when creating a new
- ``Downloader`` object."""
- # /////////////////////////////////////////////////////////////////
- # Status Constants
- # /////////////////////////////////////////////////////////////////
- INSTALLED = "installed"
- """A status string indicating that a package or collection is
- installed and up-to-date."""
- NOT_INSTALLED = "not installed"
- """A status string indicating that a package or collection is
- not installed."""
- STALE = "out of date"
- """A status string indicating that a package or collection is
- corrupt or out-of-date."""
- PARTIAL = "partial"
- """A status string indicating that a collection is partially
- installed (i.e., only some of its packages are installed.)"""
- # /////////////////////////////////////////////////////////////////
- # Constructor
- # /////////////////////////////////////////////////////////////////
- def __init__(self, server_index_url=None, download_dir=None):
- self._url = server_index_url or self.DEFAULT_URL
- """The URL for the data server's index file."""
- self._collections = {}
- """Dictionary from collection identifier to ``Collection``"""
- self._packages = {}
- """Dictionary from package identifier to ``Package``"""
- self._download_dir = download_dir
- """The default directory to which packages will be downloaded."""
- self._index = None
- """The XML index file downloaded from the data server"""
- self._index_timestamp = None
- """Time at which ``self._index`` was downloaded. If it is more
- than ``INDEX_TIMEOUT`` seconds old, it will be re-downloaded."""
- self._status_cache = {}
- """Dictionary from package/collection identifier to status
- string (``INSTALLED``, ``NOT_INSTALLED``, ``STALE``, or
- ``PARTIAL``). Cache is used for packages only, not
- collections."""
- self._errors = None
- """Flag for telling if all packages got successfully downloaded or not."""
- # decide where we're going to save things to.
- if self._download_dir is None:
- self._download_dir = self.default_download_dir()
- # /////////////////////////////////////////////////////////////////
- # Information
- # /////////////////////////////////////////////////////////////////
- def list(
- self,
- download_dir=None,
- show_packages=True,
- show_collections=True,
- header=True,
- more_prompt=False,
- skip_installed=False,
- ):
- lines = 0 # for more_prompt
- if download_dir is None:
- download_dir = self._download_dir
- print("Using default data directory (%s)" % download_dir)
- if header:
- print("=" * (26 + len(self._url)))
- print(" Data server index for <%s>" % self._url)
- print("=" * (26 + len(self._url)))
- lines += 3 # for more_prompt
- stale = partial = False
- categories = []
- if show_packages:
- categories.append("packages")
- if show_collections:
- categories.append("collections")
- for category in categories:
- print("%s:" % category.capitalize())
- lines += 1 # for more_prompt
- for info in sorted(getattr(self, category)(), key=str):
- status = self.status(info, download_dir)
- if status == self.INSTALLED and skip_installed:
- continue
- if status == self.STALE:
- stale = True
- if status == self.PARTIAL:
- partial = True
- prefix = {
- self.INSTALLED: "*",
- self.STALE: "-",
- self.PARTIAL: "P",
- self.NOT_INSTALLED: " ",
- }[status]
- name = textwrap.fill(
- "-" * 27 + (info.name or info.id), 75, subsequent_indent=27 * " "
- )[27:]
- print(" [%s] %s %s" % (prefix, info.id.ljust(20, "."), name))
- lines += len(name.split("\n")) # for more_prompt
- if more_prompt and lines > 20:
- user_input = input("Hit Enter to continue: ")
- if user_input.lower() in ("x", "q"):
- return
- lines = 0
- print()
- msg = "([*] marks installed packages"
- if stale:
- msg += "; [-] marks out-of-date or corrupt packages"
- if partial:
- msg += "; [P] marks partially installed collections"
- print(textwrap.fill(msg + ")", subsequent_indent=" ", width=76))
- def packages(self):
- self._update_index()
- return self._packages.values()
- def corpora(self):
- self._update_index()
- return [pkg for (id, pkg) in self._packages.items() if pkg.subdir == "corpora"]
- def models(self):
- self._update_index()
- return [pkg for (id, pkg) in self._packages.items() if pkg.subdir != "corpora"]
- def collections(self):
- self._update_index()
- return self._collections.values()
- # /////////////////////////////////////////////////////////////////
- # Downloading
- # /////////////////////////////////////////////////////////////////
- def _info_or_id(self, info_or_id):
- if isinstance(info_or_id, str):
- return self.info(info_or_id)
- else:
- return info_or_id
- # [xx] When during downloading is it 'safe' to abort? Only unsafe
- # time is *during* an unzip -- we don't want to leave a
- # partially-unzipped corpus in place because we wouldn't notice
- # it. But if we had the exact total size of the unzipped corpus,
- # then that would be fine. Then we could abort anytime we want!
- # So this is really what we should do. That way the threaded
- # downloader in the gui can just kill the download thread anytime
- # it wants.
def incr_download(self, info_or_id, download_dir=None, force=False):
    """
    Incrementally download a package, a collection, or a list of them,
    yielding ``DownloaderMessage`` objects along the way.

    :param info_or_id: An identifier string, an info object, or a
        list/tuple of either.
    :param download_dir: Target directory; the downloader's default
        directory is used when this is ``None``.
    :param force: Passed through to the package/list helpers.
    """
    # If they didn't specify a download_dir, then use the default one.
    target_dir = self._download_dir if download_dir is None else download_dir
    yield SelectDownloadDirMessage(target_dir)

    # A list of ids is handed off wholesale to the list helper.
    if isinstance(info_or_id, (list, tuple)):
        yield from self._download_list(info_or_id, target_dir, force)
        return

    # Look up the requested collection or package.
    try:
        info = self._info_or_id(info_or_id)
    except (IOError, ValueError) as e:
        yield ErrorMessage(None, "Error loading %s: %s" % (info_or_id, e))
        return

    # Packages are delegated to a helper function.
    if not isinstance(info, Collection):
        yield from self._download_package(info, target_dir, force)
        return

    # Collections: bracket the children's messages with start/finish.
    yield StartCollectionMessage(info)
    yield from self.incr_download(info.children, target_dir, force)
    yield FinishCollectionMessage(info)
def _num_packages(self, item):
    """Return 1 for a ``Package``; for anything else, the length of
    its ``packages`` attribute."""
    return 1 if isinstance(item, Package) else len(item.packages)
def _download_list(self, items, download_dir, force):
    """
    Download every entry of ``items``, yielding their messages with
    progress values rescaled onto one overall 0-100 range.

    :param items: A list or tuple of identifiers and/or info objects.
        It is not modified.
    :param download_dir: Directory to download into.
    :param force: Passed through to ``incr_download``.

    If any identifier cannot be resolved (``IOError``/``ValueError``),
    a single ``ErrorMessage`` is yielded and nothing is downloaded.
    """
    # Look up the requested items.  Build a new list rather than
    # assigning into ``items``: the caller may have passed a tuple
    # (which cannot be assigned to), and its sequence should not be
    # rewritten behind its back.
    resolved = []
    for item in items:
        try:
            resolved.append(self._info_or_id(item))
        except (IOError, ValueError) as e:
            yield ErrorMessage(item, e)
            return

    # Download each item, re-scaling their progress.
    num_packages = sum(self._num_packages(item) for item in resolved)
    progress = 0
    for item in resolved:
        # Fraction of the overall progress owned by this item.  Guard
        # against a total of zero (e.g. only empty collections).
        if num_packages:
            delta = self._num_packages(item) / num_packages
        else:
            delta = 0.0
        for msg in self.incr_download(item, download_dir, force):
            if isinstance(msg, ProgressMessage):
                yield ProgressMessage(progress + msg.progress * delta)
            else:
                yield msg
        progress += 100 * delta
def _download_package(self, info, download_dir, force):
    """
    Download (and, where applicable, unzip) a single package, yielding
    ``DownloaderMessage`` objects that describe each step.

    :param info: The package entry to install.
    :param download_dir: Directory to install into; it and the
        package's subdirectory are created if missing.
    :param force: When true, download even if the package's status is
        already ``INSTALLED``.

    An ``IOError`` while fetching is reported as an ``ErrorMessage``
    and ends the generator.
    """
    yield StartPackageMessage(info)
    yield ProgressMessage(0)

    # Do we already have the current version?
    status = self.status(info, download_dir)
    if not force and status == self.INSTALLED:
        yield UpToDateMessage(info)
        yield ProgressMessage(100)
        yield FinishPackageMessage(info)
        return

    # Remove the package from our status cache.
    self._status_cache.pop(info.id, None)

    # Check for (and remove) any old/stale version.
    filepath = os.path.join(download_dir, info.filename)
    if os.path.exists(filepath):
        if status == self.STALE:
            yield StaleMessage(info)
        os.remove(filepath)

    # Ensure the download_dir and the package's subdirectory exist.
    # ``makedirs`` also creates missing parents and tolerates the
    # directory already being there.
    os.makedirs(os.path.join(download_dir, info.subdir), exist_ok=True)

    # Download the file.  A url that is not found surfaces as an
    # IOError, which is reported below.
    yield StartDownloadMessage(info)
    yield ProgressMessage(5)
    try:
        # Both handles are context-managed so they are released even
        # when a read/write fails or the generator is abandoned.
        with urlopen(info.url) as infile, open(filepath, "wb") as outfile:
            num_blocks = max(1, info.size / (1024 * 16))
            for block in itertools.count():
                s = infile.read(1024 * 16)  # 16k blocks.
                if not s:
                    break
                outfile.write(s)
                if block % 2 == 0:  # report on every other block
                    yield ProgressMessage(min(80, 5 + 75 * (block / num_blocks)))
    except IOError as e:
        yield ErrorMessage(
            info,
            "Error downloading %r from <%s>:" "\n %s" % (info.id, info.url, e),
        )
        return
    yield FinishDownloadMessage(info)
    yield ProgressMessage(80)

    # If it's a zipfile, uncompress it.
    if info.filename.endswith(".zip"):
        zipdir = os.path.join(download_dir, info.subdir)
        # Unzip if we're unzipping by default; *or* if it's already
        # been unzipped (presumably a previous version).
        if info.unzip or os.path.exists(os.path.join(zipdir, info.id)):
            yield StartUnzipMessage(info)
            for msg in _unzip_iter(filepath, zipdir, verbose=False):
                # Somewhat of a hack, but we need a proper package reference
                msg.package = info
                yield msg
            yield FinishUnzipMessage(info)

    yield FinishPackageMessage(info)
def download(
    self,
    info_or_id=None,
    download_dir=None,
    quiet=False,
    force=False,
    prefix="[nltk_data] ",
    halt_on_error=True,
    raise_on_error=False,
    print_error_to=sys.stderr,
):
    """
    Download a package or collection, reporting progress as text.

    :param info_or_id: The item to download.  If ``None``, the
        interactive downloader is started instead and ``True`` is
        returned.
    :param download_dir: Directory to download into; passed on to
        ``incr_download()``.
    :param quiet: If true, only error messages are printed and the
        user is never prompted to retry.
    :param force: Passed on to ``incr_download()``.
    :param prefix: Text placed in front of every printed line.
    :param halt_on_error: If true, return ``False`` on the first error.
    :param raise_on_error: If true, raise ``ValueError`` on the first
        error (takes precedence over ``halt_on_error``).
    :param print_error_to: File object that all output is written to.
    :return: ``False`` if the download was halted by an error or
        cancelled by the user; ``True`` otherwise.
    :raise ValueError: On the first error, if ``raise_on_error`` is set.
    """
    print_to = functools.partial(print, file=print_error_to)

    # If no info or id is given, then use the interactive shell.
    if info_or_id is None:
        # [xx] hmm -- changing self._download_dir here seems like
        # the wrong thing to do.  Maybe the _interactive_download
        # function should make a new copy of self to use?
        if download_dir is not None:
            self._download_dir = download_dir
        self._interactive_download()
        return True

    # Marker appended to ``prefix`` while inside a collection.  It is
    # removed again by length, so the two operations cannot drift apart.
    collection_indent = " | "

    # Define a helper function for displaying output.  It reads
    # ``prefix`` at call time, so it follows the nesting changes below.
    def show(s, prefix2=""):
        print_to(
            textwrap.fill(
                s,
                initial_indent=prefix + prefix2,
                subsequent_indent=prefix + prefix2 + " " * 4,
            )
        )

    for msg in self.incr_download(info_or_id, download_dir, force):
        # Error messages
        if isinstance(msg, ErrorMessage):
            show(msg.message)
            if raise_on_error:
                raise ValueError(msg.message)
            if halt_on_error:
                return False
            self._errors = True
            if not quiet:
                print_to("Error installing package. Retry? [n/y/e]")
                choice = input().strip()
                if choice in ["y", "Y"]:
                    # Forward *all* options, including the output
                    # stream, so the retry reports to the same place.
                    retried = self.download(
                        msg.package.id,
                        download_dir,
                        quiet,
                        force,
                        prefix,
                        halt_on_error,
                        raise_on_error,
                        print_error_to,
                    )
                    if not retried:
                        return False
                elif choice in ["e", "E"]:
                    return False

        # All other messages
        if not quiet:
            # Collection downloading messages:
            if isinstance(msg, StartCollectionMessage):
                show("Downloading collection %r" % msg.collection.id)
                prefix += collection_indent
                print_to(prefix)
            elif isinstance(msg, FinishCollectionMessage):
                print_to(prefix)
                prefix = prefix[: -len(collection_indent)]
                if self._errors:
                    show(
                        "Downloaded collection %r with errors"
                        % msg.collection.id
                    )
                else:
                    show("Done downloading collection %s" % msg.collection.id)

            # Package downloading messages (StaleMessage is not reported):
            elif isinstance(msg, StartPackageMessage):
                show(
                    "Downloading package %s to %s..."
                    % (msg.package.id, download_dir)
                )
            elif isinstance(msg, UpToDateMessage):
                show("Package %s is already up-to-date!" % msg.package.id, " ")
            elif isinstance(msg, StartUnzipMessage):
                show("Unzipping %s." % msg.package.filename, " ")

            # Data directory message:
            elif isinstance(msg, SelectDownloadDirMessage):
                download_dir = msg.download_dir
    return True
def is_stale(self, info_or_id, download_dir=None):
    """
    Return true if ``status()`` reports the given item as ``STALE``.
    """
    current = self.status(info_or_id, download_dir)
    return current == self.STALE
def is_installed(self, info_or_id, download_dir=None):
    """
    Return true if ``status()`` reports the given item as ``INSTALLED``.
    """
    current = self.status(info_or_id, download_dir)
    return current == self.INSTALLED
def clear_status_cache(self, id=None):
    """
    Forget cached status values.

    :param id: If given, only the entry for this identifier is
        dropped (a missing entry is ignored); otherwise the whole
        cache is emptied.
    """
    if id is not None:
        self._status_cache.pop(id, None)
        return
    self._status_cache.clear()
def status(self, info_or_id, download_dir=None):
    """
    Return a constant describing the status of the given package
    or collection.  Status can be one of ``INSTALLED``,
    ``NOT_INSTALLED``, ``STALE``, or ``PARTIAL``.

    :param info_or_id: The item to check; resolved through
        ``_info_or_id()``.
    :param download_dir: Directory to check in.  Defaults to
        ``self._download_dir``; results are only cached for that
        default directory.
    """
    if download_dir is None:
        download_dir = self._download_dir
    info = self._info_or_id(info_or_id)

    # Handle collections:
    if isinstance(info, Collection):
        # Check the members in the directory that was asked about, not
        # unconditionally in the default one.
        pkg_status = [self.status(pkg.id, download_dir) for pkg in info.packages]
        if self.STALE in pkg_status:
            return self.STALE
        elif self.PARTIAL in pkg_status:
            return self.PARTIAL
        elif self.INSTALLED in pkg_status and self.NOT_INSTALLED in pkg_status:
            return self.PARTIAL
        elif self.NOT_INSTALLED in pkg_status:
            return self.NOT_INSTALLED
        else:
            return self.INSTALLED

    # Handle packages:
    filepath = os.path.join(download_dir, info.filename)
    if download_dir != self._download_dir:
        # The cache only describes the default directory.
        return self._pkg_status(info, filepath)
    if info.id not in self._status_cache:
        self._status_cache[info.id] = self._pkg_status(info, filepath)
    return self._status_cache[info.id]
- def _pkg_status(self, info, filepath):
- if not os.path.exists(filepath):
- return self.NOT_INSTALLED
- # Check if the file has the correct size.
- try:
- filestat = os.stat(filepath)
- except OSError:
- return self.NOT_INSTALLED
- if filestat.st_size != int(info.size):
- return self.STALE
- # Check if the file's checksum matches
- if md5_hexdigest(filepath) != info.checksum:
- return self.STALE
- # If it's a zipfile, and it's been at least partially
- # unzipped, then check if it's been fully unzipped.
- if filepath.endswith(".zip"):
- unzipdir = filepath[:-4]
- if not os.path.exists(unzipdir):
- return self.INSTALLED # but not unzipped -- ok!
- if not os.path.isdir(unzipdir):
- return self.STALE
- unzipped_size = sum(
- os.stat(os.path.join(d, f)).st_size
- for d, _, files in os.walk(unzipdir)
- for f in files
- )
- if unzipped_size != info.unzipped_size:
- return self.STALE
- # Otherwise, everything looks good.
- return self.INSTALLED
def update(self, quiet=False, prefix="[nltk_data] "):
    """
    Re-download every package whose status is ``STALE``.

    The status cache is cleared first; ``quiet`` and ``prefix`` are
    passed through to ``download()``.
    """
    self.clear_status_cache()
    for package in self.packages():
        if self.status(package) != self.STALE:
            continue
        self.download(package, quiet=quiet, prefix=prefix)
- # /////////////////////////////////////////////////////////////////
- # Index
- # /////////////////////////////////////////////////////////////////
def _update_index(self, url=None):
    """A helper function that ensures that self._index is
    up-to-date.  If the index is older than self.INDEX_TIMEOUT,
    then download it again.

    :param url: If given, becomes the new index URL and forces a
        download regardless of the index's age.
    """
    # Check if the index is already up-to-date.  If so, do nothing.
    if not (
        self._index is None
        or url is not None
        or time.time() - self._index_timestamp > self.INDEX_TIMEOUT
    ):
        return

    # If a URL was specified, then update our URL.
    self._url = url or self._url

    # Download the index file, closing the connection once it is parsed.
    with urlopen(self._url) as response:
        root = ElementTree.parse(response).getroot()
    self._index = nltk.internals.ElementWrapper(root)
    self._index_timestamp = time.time()

    # Build a dictionary of packages.
    packages = [Package.fromxml(p) for p in self._index.findall("packages/package")]
    self._packages = {p.id: p for p in packages}

    # Build a dictionary of collections.
    collections = [
        Collection.fromxml(c) for c in self._index.findall("collections/collection")
    ]
    self._collections = {c.id: c for c in collections}

    # Replace identifiers with actual children in collection.children.
    # The resolved list is built separately: deleting entries while
    # enumerating the same list would skip the entry after each removal.
    for collection in self._collections.values():
        resolved = []
        for child_id in collection.children:
            if child_id in self._packages:
                resolved.append(self._packages[child_id])
            elif child_id in self._collections:
                resolved.append(self._collections[child_id])
            else:
                print(
                    "removing collection member with no package: {}".format(
                        child_id
                    )
                )
        # Update in place so the list object itself stays the same.
        collection.children[:] = resolved

    # Fill in collection.packages for each collection.
    for collection in self._collections.values():
        packages = {}
        # ``queue`` grows while it is being iterated: nested
        # collections append their children for a breadth-first walk.
        queue = [collection]
        for child in queue:
            if isinstance(child, Collection):
                queue.extend(child.children)
            elif isinstance(child, Package):
                packages[child.id] = child
        collection.packages = packages.values()

    # Flush the status cache
    self._status_cache.clear()
def index(self):
    """
    Return the XML index of the packages available from the data
    server, refreshing it first through ``_update_index()``.
    """
    self._update_index()
    current_index = self._index
    return current_index
def info(self, id):
    """Return the ``Package`` or ``Collection`` record for the
    given item.  Packages take precedence over collections.

    :raise ValueError: If ``id`` names neither.
    """
    self._update_index()
    for records in (self._packages, self._collections):
        if id in records:
            return records[id]
    raise ValueError("Package %r not found in index" % id)
def xmlinfo(self, id):
    """Return the XML info record for the given item.

    Package elements are searched before collection elements.

    :raise ValueError: If no element has the given ``id``.
    """
    self._update_index()
    for xpath in ("packages/package", "collections/collection"):
        for element in self._index.findall(xpath):
            if element.get("id") == id:
                return element
    raise ValueError("Package %r not found in index" % id)
- # /////////////////////////////////////////////////////////////////
- # URL & Data Directory
- # /////////////////////////////////////////////////////////////////
def _get_url(self):
    """The URL for the data server's index file."""
    current = self._url
    return current


def _set_url(self, url):
    """
    Point the downloader at a new data server by re-loading the index
    from ``url``.  If that fails for any reason, the previous URL is
    put back and the exception is re-raised.
    """
    previous = self._url
    try:
        self._update_index(url)
    except BaseException:
        # Roll back on *anything* (this mirrors a bare ``except``),
        # then let the error propagate to the caller.
        self._url = previous
        raise


url = property(_get_url, _set_url)
def default_download_dir(self):
    """
    Return the directory to which packages will be downloaded by
    default.  This value can be overridden using the constructor,
    or on a case-by-case basis using the ``download_dir`` argument when
    calling ``download()``.

    The result is chosen as follows:

    - ``None`` when the ``APPENGINE_RUNTIME`` environment variable is
      set (the filesystem is treated as not writable there).
    - Otherwise the first entry of ``nltk.data.path`` that exists and
      is writable.
    - Otherwise ``nltk_data`` inside ``%APPDATA%`` on Windows (when that
      variable is set), or inside the user's home directory.

    :raise ValueError: If the home directory cannot be determined.
    """
    # Check if we are on GAE where we cannot write into filesystem.
    if "APPENGINE_RUNTIME" in os.environ:
        return

    # Check if we have sufficient permissions to install in a
    # variety of system-wide locations.
    writable_dir = next(
        (
            candidate
            for candidate in nltk.data.path
            if os.path.exists(candidate) and nltk.internals.is_writable(candidate)
        ),
        None,
    )
    if writable_dir is not None:
        return writable_dir

    use_appdata = sys.platform == "win32" and "APPDATA" in os.environ
    if use_appdata:
        # On Windows, use %APPDATA%
        homedir = os.environ["APPDATA"]
    else:
        # Otherwise, install in the user's home directory.
        homedir = os.path.expanduser("~/")
        if homedir == "~/":
            raise ValueError("Could not find a default download directory")

    # append "nltk_data" to the home directory
    return os.path.join(homedir, "nltk_data")
def _get_download_dir(self):
    """
    The default directory to which packages will be downloaded.
    This defaults to the value returned by ``default_download_dir()``.
    To override this default on a case-by-case basis, use the
    ``download_dir`` argument when calling ``download()``.
    """
    current = self._download_dir
    return current


def _set_download_dir(self, download_dir):
    """Change the default download directory and drop all cached
    status values, which described the previous directory."""
    self._download_dir = download_dir
    self._status_cache.clear()


download_dir = property(_get_download_dir, _set_download_dir)
- # /////////////////////////////////////////////////////////////////
- # Interactive Shell
- # /////////////////////////////////////////////////////////////////
def _interactive_download(self):
    """
    Start an interactive downloader: the GUI when ``TKINTER`` is true,
    falling back to the text shell if the GUI raises ``TclError`` or
    when ``TKINTER`` is false.
    """
    if not TKINTER:
        DownloaderShell(self).run()
        return
    try:
        DownloaderGUI(self).mainloop()
    except TclError:
        DownloaderShell(self).run()
class DownloaderShell(object):
    """
    A simple text-mode front end for a downloader object.

    Prompts are read with ``input()`` and all output is printed to
    standard output.
    """

    def __init__(self, dataserver):
        """
        :param dataserver: The downloader object that commands are
            forwarded to.
        """
        self._ds = dataserver

    def _simple_interactive_menu(self, *options):
        """Print ``options`` on one line, spread out between two rules.

        Requires at least two options (the gap width is divided by the
        number of gaps).
        """
        print("-" * 75)
        spc = (68 - sum(len(o) for o in options)) // (len(options) - 1) * " "
        print(" " + spc.join(options))
        print("-" * 75)

    def run(self):
        """Run the main command loop until the user quits."""
        print("NLTK Downloader")
        while True:
            self._simple_interactive_menu(
                "d) Download",
                "l) List",
                " u) Update",
                "c) Config",
                "h) Help",
                "q) Quit",
            )
            user_input = input("Downloader> ").strip()
            if not user_input:
                print()
                continue
            command = user_input.lower().split()[0]
            args = user_input.split()[1:]
            try:
                if command == "l":
                    print()
                    self._ds.list(self._ds.download_dir, header=False, more_prompt=True)
                elif command == "h":
                    self._simple_interactive_help()
                elif command == "c":
                    self._simple_interactive_config()
                elif command in ("q", "x"):
                    return
                elif command == "d":
                    self._simple_interactive_download(args)
                elif command == "u":
                    self._simple_interactive_update()
                else:
                    print("Command %r unrecognized" % user_input)
            except HTTPError as e:
                print("Error reading from server: %s" % e)
            except URLError as e:
                print("Error connecting to server: %s" % e.reason)
            # try checking if user_input is a package name, &
            # downloading it?
            print()

    def _simple_interactive_download(self, args):
        """Download the identifiers in ``args``; if there are none,
        prompt for identifiers until some are given or the user cancels."""
        if args:
            for arg in args:
                self._download_one(arg)
            return

        while True:
            print()
            print("Download which package (l=list; x=cancel)?")
            user_input = input(" Identifier> ")
            if user_input.lower() == "l":
                self._ds.list(
                    self._ds.download_dir,
                    header=False,
                    more_prompt=True,
                    skip_installed=True,
                )
                continue
            elif user_input.lower() in ("x", "q", ""):
                return
            elif user_input:
                for package_id in user_input.split():
                    self._download_one(package_id)
                break

    def _download_one(self, package_id):
        """Download one item, printing (not raising) I/O and value errors."""
        try:
            self._ds.download(package_id, prefix=" ")
        except (IOError, ValueError) as e:
            print(e)

    def _simple_interactive_update(self):
        """List the stale packages and, on confirmation, re-download them."""
        while True:
            stale_packages = []
            for info in sorted(self._ds.packages(), key=str):
                if self._ds.status(info) == self._ds.STALE:
                    stale_packages.append((info.id, info.name))

            print()
            if not stale_packages:
                print("Nothing to update.")
                return

            print("Will update following packages (o=ok; x=cancel)")
            for pid, pname in stale_packages:
                # Wrap the name as if it started at column 27, then cut
                # off the placeholder dashes that reserved that space.
                name = textwrap.fill(
                    "-" * 27 + (pname), 75, subsequent_indent=27 * " "
                )[27:]
                print(" [ ] %s %s" % (pid.ljust(20, "."), name))
            print()

            user_input = input(" Identifier> ")
            if user_input.lower() == "o":
                for pid, _ in stale_packages:
                    self._download_one(pid)
                break
            elif user_input.lower() in ("x", "q", ""):
                return
            # Any other answer shows the list again.

    def _simple_interactive_help(self):
        """Print a summary of the main-menu commands."""
        print()
        print("Commands:")
        print(
            " d) Download a package or collection u) Update out of date packages"
        )
        print(" l) List packages & collections h) Help")
        print(" c) View & Modify Configuration q) Quit")

    def _show_config(self):
        """Print the server URL, item counts and the data directory."""
        print()
        print("Data Server:")
        print(" - URL: <%s>" % self._ds.url)
        print((" - %d Package Collections Available" % len(self._ds.collections())))
        print((" - %d Individual Packages Available" % len(self._ds.packages())))
        print()
        print("Local Machine:")
        print(" - Data directory: %s" % self._ds.download_dir)

    def _simple_interactive_config(self):
        """Run the configuration sub-menu until the user returns to the
        main menu."""
        self._show_config()
        while True:
            print()
            self._simple_interactive_menu(
                "s) Show Config", "u) Set Server URL", "d) Set Data Dir", "m) Main Menu"
            )
            user_input = input("Config> ").strip().lower()
            if user_input == "s":
                self._show_config()
            elif user_input == "d":
                new_dl_dir = input(" New Directory> ").strip()
                if new_dl_dir in ("", "x", "q", "X", "Q"):
                    print(" Cancelled!")
                elif os.path.isdir(new_dl_dir):
                    self._ds.download_dir = new_dl_dir
                else:
                    print(("Directory %r not found! Create it first." % new_dl_dir))
            elif user_input == "u":
                new_url = input(" New URL> ").strip()
                if new_url in ("", "x", "q", "X", "Q"):
                    print(" Cancelled!")
                else:
                    if not new_url.startswith(("http://", "https://")):
                        new_url = "http://" + new_url
                    try:
                        self._ds.url = new_url
                    except Exception as e:
                        print("Error reading <%r>:\n %s" % (new_url, e))
            elif user_input == "m":
                break
- class DownloaderGUI(object):
- """
- Graphical interface for downloading packages from the NLTK data
- server.
- """
- # /////////////////////////////////////////////////////////////////
- # Column Configuration
- # /////////////////////////////////////////////////////////////////
# Names of every table column; the first (empty-named) column holds the
# row's mark.
COLUMNS = [
    "",
    "Identifier",
    "Name",
    "Size",
    "Status",
    "Unzipped Size",
    "Copyright",
    "Contact",
    "License",
    "Author",
    "Subdir",
    "Checksum",
]
"""A list of the names of columns. This controls the order in
which the columns will appear. If this is edited, then
``_package_to_columns()`` may need to be edited to match."""
COLUMN_WEIGHTS = {"": 0, "Name": 5, "Size": 0, "Status": 0}
"""A dictionary specifying how columns should be resized when the
table is resized. Columns with weight 0 will not be resized at
all; and columns with high weight will be resized more.
Default weight (for columns not explicitly listed) is 1."""
COLUMN_WIDTHS = {
    "": 1,
    "Identifier": 20,
    "Name": 45,
    "Size": 10,
    "Unzipped Size": 10,
    "Status": 12,
}
"""A dictionary specifying how wide each column should be, in
characters. The default width (for columns not explicitly
listed) is specified by ``DEFAULT_COLUMN_WIDTH``."""
DEFAULT_COLUMN_WIDTH = 30
"""The default width for columns that are not explicitly listed
in ``COLUMN_WIDTHS``."""
INITIAL_COLUMNS = ["", "Identifier", "Name", "Size", "Status"]
"""The set of columns that should be displayed by default."""
# Perform a few import-time sanity checks to make sure that the
# column configuration variables are defined consistently:
# (every key/name used above must be a known column; note that the
# loop variable ``c`` remains defined after these loops).
for c in COLUMN_WEIGHTS:
    assert c in COLUMNS
for c in COLUMN_WIDTHS:
    assert c in COLUMNS
for c in INITIAL_COLUMNS:
    assert c in COLUMNS
# /////////////////////////////////////////////////////////////////
# Color Configuration
# /////////////////////////////////////////////////////////////////
# Index 0 is used as a foreground and index 1 as a background color.
_BACKDROP_COLOR = ("#000", "#ccc")
# Per-status row colors: (background, background when selected).
_ROW_COLOR = {
    Downloader.INSTALLED: ("#afa", "#080"),
    Downloader.PARTIAL: ("#ffa", "#880"),
    Downloader.STALE: ("#faa", "#800"),
    Downloader.NOT_INSTALLED: ("#fff", "#888"),
}
# (foreground, background) of the mark column.
_MARK_COLOR = ("#000", "#ccc")
# _FRONT_TAB_COLOR = ('#ccf', '#008')
# _BACK_TAB_COLOR = ('#88a', '#448')
# (foreground, background) of the active tab label and of the others.
_FRONT_TAB_COLOR = ("#fff", "#45c")
_BACK_TAB_COLOR = ("#aaa", "#67a")
# Index 1 is the progress bar's background; index 0 is presumably the
# bar's fill color -- its use is not visible here, TODO confirm.
_PROGRESS_COLOR = ("#f00", "#aaa")
# Font used for the tab labels.
_TAB_FONT = "helvetica -16 bold"
- # /////////////////////////////////////////////////////////////////
- # Constructor
- # /////////////////////////////////////////////////////////////////
def __init__(self, dataserver, use_threads=True):
    """
    Create the downloader window and fill it from ``dataserver``.

    :param dataserver: The downloader object that backs the GUI.
    :param use_threads: Stored as ``self._use_threads``; when true,
        ``_download()`` delegates to the threaded downloader.
    """
    self._ds = dataserver
    self._use_threads = use_threads

    # State for the threaded downloader.
    self._download_lock = threading.Lock()
    self._download_msg_queue = []
    self._download_abort_queue = []
    self._downloading = False

    # Identifiers of pending tkinter ``after`` callbacks.
    self._afterid = {}

    # The message log (must exist before the first ``_log`` call).
    self._log_messages = []
    self._log_indent = 0
    self._log("NLTK Downloader Started!")

    # Build the main window.
    window = Tk()
    self.top = window
    window.geometry("+50+50")
    window.title("NLTK Downloader")
    window.configure(background=self._BACKDROP_COLOR[1])

    # Bind the exit keys right away, in case anything goes wrong below.
    for key_sequence in ("<Control-q>", "<Control-x>"):
        window.bind(key_sequence, self.destroy)
    self._destroyed = False

    self._column_vars = {}

    # Assemble the GUI; a server failure is reported but not fatal.
    self._init_widgets()
    self._init_menu()
    try:
        self._fill_table()
    except HTTPError as err:
        showerror("Error reading from server", err)
    except URLError as err:
        showerror("Error connecting to server", err.reason)

    self._show_info()
    self._select_columns()
    self._table.select(0)

    # Get notified when the table is destroyed, so that any download
    # in progress can be cancelled.
    self._table.bind("<Destroy>", self._destroy)
- def _log(self, msg):
- self._log_messages.append(
- "%s %s%s" % (time.ctime(), " | " * self._log_indent, msg)
- )
- # /////////////////////////////////////////////////////////////////
- # Internals
- # /////////////////////////////////////////////////////////////////
def _init_widgets(self):
    """
    Create and lay out all widgets of the main window: the tab row,
    the package table, the Download/Refresh buttons, the URL and
    download-directory entries, and the progress bar.

    Sets ``self._tab_names``, ``self._tabs``, ``self._table``,
    ``self._info``, ``self._download_button``, ``self._refresh_button``,
    ``self._progresslabel`` and ``self._progressbar``.
    """
    # Create the top-level frame structures
    f1 = Frame(self.top, relief="raised", border=2, padx=8, pady=0)
    # The option is spelled out in full (``side``) instead of relying
    # on Tk resolving an abbreviated option name.
    f1.pack(side="top", expand=True, fill="both")
    f1.grid_rowconfigure(2, weight=1)
    f1.grid_columnconfigure(0, weight=1)
    Frame(f1, height=8).grid(column=0, row=0)  # spacer
    tabframe = Frame(f1)
    tabframe.grid(column=0, row=1, sticky="news")
    tableframe = Frame(f1)
    tableframe.grid(column=0, row=2, sticky="news")
    buttonframe = Frame(f1)
    buttonframe.grid(column=0, row=3, sticky="news")
    Frame(f1, height=8).grid(column=0, row=4)  # spacer
    infoframe = Frame(f1)
    infoframe.grid(column=0, row=5, sticky="news")
    Frame(f1, height=8).grid(column=0, row=6)  # spacer
    progressframe = Frame(
        self.top, padx=3, pady=3, background=self._BACKDROP_COLOR[1]
    )
    progressframe.pack(side="bottom", fill="x")
    self.top["border"] = 0
    self.top["highlightthickness"] = 0

    # Create the tabs
    self._tab_names = ["Collections", "Corpora", "Models", "All Packages"]
    self._tabs = {}
    for i, tab in enumerate(self._tab_names):
        label = Label(tabframe, text=tab, font=self._TAB_FONT)
        # Even-indexed tabs get horizontal padding, odd-indexed none.
        label.pack(side="left", padx=((i + 1) % 2) * 10)
        label.bind("<Button-1>", self._select_tab)
        self._tabs[tab.lower()] = label

    # Create the table.
    column_weights = [self.COLUMN_WEIGHTS.get(column, 1) for column in self.COLUMNS]
    self._table = Table(
        tableframe,
        self.COLUMNS,
        column_weights=column_weights,
        highlightthickness=0,
        listbox_height=16,
        reprfunc=self._table_reprfunc,
    )
    self._table.columnconfig(0, foreground=self._MARK_COLOR[0])  # marked
    for i, column in enumerate(self.COLUMNS):
        width = self.COLUMN_WIDTHS.get(column, self.DEFAULT_COLUMN_WIDTH)
        self._table.columnconfig(i, width=width)
    self._table.pack(expand=True, fill="both")
    self._table.focus()
    self._table.bind_to_listboxes("<Double-Button-1>", self._download)
    self._table.bind("<space>", self._table_mark)
    self._table.bind("<Return>", self._download)
    self._table.bind("<Left>", self._prev_tab)
    self._table.bind("<Right>", self._next_tab)
    self._table.bind("<Control-a>", self._mark_all)

    # Create entry boxes for URL & download_dir
    infoframe.grid_columnconfigure(1, weight=1)
    info = [
        ("url", "Server Index:", self._set_url),
        ("download_dir", "Download Directory:", self._set_download_dir),
    ]
    self._info = {}
    for i, (key, label, callback) in enumerate(info):
        Label(infoframe, text=label).grid(column=0, row=i, sticky="e")
        entry = Entry(
            infoframe, font="courier", relief="groove", disabledforeground="black"
        )
        self._info[key] = (entry, callback)
        entry.bind("<Return>", self._info_save)
        # ``key=key`` binds the current value; a plain closure would
        # see only the last key of the loop.
        entry.bind("<Button-1>", lambda e, key=key: self._info_edit(key))
        entry.grid(column=1, row=i, sticky="ew")

    # If the user edits url or download_dir, and then clicks outside
    # the entry box, then save their results.
    self.top.bind("<Button-1>", self._info_save)

    # Create Download & Refresh buttons.
    self._download_button = Button(
        buttonframe, text="Download", command=self._download, width=8
    )
    self._download_button.pack(side="left")
    self._refresh_button = Button(
        buttonframe, text="Refresh", command=self._refresh, width=8
    )
    self._refresh_button.pack(side="right")

    # Create Progress bar
    self._progresslabel = Label(
        progressframe,
        text="",
        foreground=self._BACKDROP_COLOR[0],
        background=self._BACKDROP_COLOR[1],
    )
    self._progressbar = Canvas(
        progressframe,
        width=200,
        height=16,
        background=self._PROGRESS_COLOR[1],
        relief="sunken",
        border=1,
    )
    self._init_progressbar()
    self._progressbar.pack(side="right")
    self._progresslabel.pack(side="left")
def _init_menu(self):
    """
    Build the File, View, Sort and Help menus and install the menu
    bar on the top-level window.  Also fills ``self._column_vars``
    with one ``IntVar`` per column that can be shown or hidden.
    """
    menubar = Menu(self.top)

    # --- File menu ---
    file_menu = Menu(menubar, tearoff=0)
    file_menu.add_command(
        label="Download", underline=0, command=self._download, accelerator="Return"
    )
    file_menu.add_separator()
    file_menu.add_command(
        label="Change Server Index",
        underline=7,
        command=lambda: self._info_edit("url"),
    )
    file_menu.add_command(
        label="Change Download Directory",
        underline=0,
        command=lambda: self._info_edit("download_dir"),
    )
    file_menu.add_separator()
    file_menu.add_command(label="Show Log", underline=5, command=self._show_log)
    file_menu.add_separator()
    file_menu.add_command(
        label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
    )
    menubar.add_cascade(label="File", underline=0, menu=file_menu)

    # --- View menu ---
    # Controls which table columns are shown.  The first two columns
    # (mark and identifier) are never hidden, so they get no entry.
    view_menu = Menu(menubar, tearoff=0)
    for column_name in self._table.column_names[2:]:
        visible = IntVar(self.top)
        assert column_name not in self._column_vars
        self._column_vars[column_name] = visible
        if column_name in self.INITIAL_COLUMNS:
            visible.set(1)
        view_menu.add_checkbutton(
            label=column_name,
            underline=0,
            variable=visible,
            command=self._select_columns,
        )
    menubar.add_cascade(label="View", underline=0, menu=view_menu)

    # --- Sort menu ---
    # [xx] this should be selectbuttons; and it should include
    # reversed sorts as options.
    sort_menu = Menu(menubar, tearoff=0)
    sortable_columns = self._table.column_names[1:]
    for column_name in sortable_columns:
        # The default argument captures this iteration's column.
        sort_menu.add_command(
            label="Sort by %s" % column_name,
            command=(lambda c=column_name: self._table.sort_by(c, "ascending")),
        )
    sort_menu.add_separator()
    for column_name in sortable_columns:
        sort_menu.add_command(
            label="Reverse sort by %s" % column_name,
            command=(lambda c=column_name: self._table.sort_by(c, "descending")),
        )
    menubar.add_cascade(label="Sort", underline=0, menu=sort_menu)

    # --- Help menu ---
    help_menu = Menu(menubar, tearoff=0)
    help_menu.add_command(label="About", underline=0, command=self.about)
    help_menu.add_command(
        label="Instructions", underline=0, command=self.help, accelerator="F1"
    )
    menubar.add_cascade(label="Help", underline=0, menu=help_menu)
    self.top.bind("<F1>", self.help)

    self.top.config(menu=menubar)
- def _select_columns(self):
- for (column, var) in self._column_vars.items():
- if var.get():
- self._table.show_column(column)
- else:
- self._table.hide_column(column)
- def _refresh(self):
- self._ds.clear_status_cache()
- try:
- self._fill_table()
- except HTTPError as e:
- showerror("Error reading from server", e)
- except URLError as e:
- showerror("Error connecting to server", e.reason)
- self._table.select(0)
- def _info_edit(self, info_key):
- self._info_save() # just in case.
- (entry, callback) = self._info[info_key]
- entry["state"] = "normal"
- entry["relief"] = "sunken"
- entry.focus()
- def _info_save(self, e=None):
- focus = self._table
- for entry, callback in self._info.values():
- if entry["state"] == "disabled":
- continue
- if e is not None and e.widget is entry and e.keysym != "Return":
- focus = entry
- else:
- entry["state"] = "disabled"
- entry["relief"] = "groove"
- callback(entry.get())
- focus.focus()
- def _table_reprfunc(self, row, col, val):
- if self._table.column_names[col].endswith("Size"):
- if isinstance(val, str):
- return " %s" % val
- elif val < 1024 ** 2:
- return " %.1f KB" % (val / 1024.0 ** 1)
- elif val < 1024 ** 3:
- return " %.1f MB" % (val / 1024.0 ** 2)
- else:
- return " %.1f GB" % (val / 1024.0 ** 3)
- if col in (0, ""):
- return str(val)
- else:
- return " %s" % val
- def _set_url(self, url):
- if url == self._ds.url:
- return
- try:
- self._ds.url = url
- self._fill_table()
- except IOError as e:
- showerror("Error Setting Server Index", str(e))
- self._show_info()
- def _set_download_dir(self, download_dir):
- if self._ds.download_dir == download_dir:
- return
- # check if the dir exists, and if not, ask if we should create it?
- # Clear our status cache, & re-check what's installed
- self._ds.download_dir = download_dir
- try:
- self._fill_table()
- except HTTPError as e:
- showerror("Error reading from server", e)
- except URLError as e:
- showerror("Error connecting to server", e.reason)
- self._show_info()
- def _show_info(self):
- print("showing info", self._ds.url)
- for entry, cb in self._info.values():
- entry["state"] = "normal"
- entry.delete(0, "end")
- self._info["url"][0].insert(0, self._ds.url)
- self._info["download_dir"][0].insert(0, self._ds.download_dir)
- for entry, cb in self._info.values():
- entry["state"] = "disabled"
- def _prev_tab(self, *e):
- for i, tab in enumerate(self._tab_names):
- if tab.lower() == self._tab and i > 0:
- self._tab = self._tab_names[i - 1].lower()
- try:
- return self._fill_table()
- except HTTPError as e:
- showerror("Error reading from server", e)
- except URLError as e:
- showerror("Error connecting to server", e.reason)
- def _next_tab(self, *e):
- for i, tab in enumerate(self._tab_names):
- if tab.lower() == self._tab and i < (len(self._tabs) - 1):
- self._tab = self._tab_names[i + 1].lower()
- try:
- return self._fill_table()
- except HTTPError as e:
- showerror("Error reading from server", e)
- except URLError as e:
- showerror("Error connecting to server", e.reason)
- def _select_tab(self, event):
- self._tab = event.widget["text"].lower()
- try:
- self._fill_table()
- except HTTPError as e:
- showerror("Error reading from server", e)
- except URLError as e:
- showerror("Error connecting to server", e.reason)
# Lower-cased name of the active tab; this class-level value is the
# tab shown first.
_tab = "collections"
# _tab = 'corpora'
# NOTE(review): ``_rows`` is not referenced by the code visible here --
# confirm whether it is still needed.
_rows = None
- def _fill_table(self):
- selected_row = self._table.selected_row()
- self._table.clear()
- if self._tab == "all packages":
- items = self._ds.packages()
- elif self._tab == "corpora":
- items = self._ds.corpora()
- elif self._tab == "models":
- items = self._ds.models()
- elif self._tab == "collections":
- items = self._ds.collections()
- else:
- assert 0, "bad tab value %r" % self._tab
- rows = [self._package_to_columns(item) for item in items]
- self._table.extend(rows)
- # Highlight the active tab.
- for tab, label in self._tabs.items():
- if tab == self._tab:
- label.configure(
- foreground=self._FRONT_TAB_COLOR[0],
- background=self._FRONT_TAB_COLOR[1],
- )
- else:
- label.configure(
- foreground=self._BACK_TAB_COLOR[0],
- background=self._BACK_TAB_COLOR[1],
- )
- self._table.sort_by("Identifier", order="ascending")
- self._color_table()
- self._table.select(selected_row)
- # This is a hack, because the scrollbar isn't updating its
- # position right -- I'm not sure what the underlying cause is
- # though. (This is on OS X w/ python 2.5) The length of
- # delay that's necessary seems to depend on how fast the
- # comptuer is. :-/
- self.top.after(150, self._table._scrollbar.set, *self._table._mlb.yview())
- self.top.after(300, self._table._scrollbar.set, *self._table._mlb.yview())
- def _update_table_status(self):
- for row_num in range(len(self._table)):
- status = self._ds.status(self._table[row_num, "Identifier"])
- self._table[row_num, "Status"] = status
- self._color_table()
- def _download(self, *e):
- # If we're using threads, then delegate to the threaded
- # downloader instead.
- if self._use_threads:
- return self._download_threaded(*e)
- marked = [
- self._table[row, "Identifier"]
- for row in range(len(self._table))
- if self._table[row, 0] != ""
- ]
- selection = self._table.selected_row()
- if not marked and selection is not None:
- marked = [self._table[selection, "Identifier"]]
- download_iter = self._ds.incr_download(marked, self._ds.download_dir)
- self._log_indent = 0
- self._download_cb(download_iter, marked)
# Delay passed to ``self.top.after()`` between two successive
# ``_download_cb`` steps.
_DL_DELAY = 10
- def _download_cb(self, download_iter, ids):
- try:
- msg = next(download_iter)
- except StopIteration:
- # self._fill_table(sort=False)
- self._update_table_status()
- afterid = self.top.after(10, self._show_progress, 0)
- self._afterid["_download_cb"] = afterid
- return
- def show(s):
- self._progresslabel["text"] = s
- self._log(s)
- if isinstance(msg, ProgressMessage):
- self._show_progress(msg.progress)
- elif isinstance(msg, ErrorMessage):
- show(msg.message)
- if msg.package is not None:
- self._select(msg.package.id)
- self._show_progress(None)
- return # halt progress.
- elif isinstance(msg, StartCollectionMessage):
- show("Downloading collection %s" % msg.collection.id)
- self._log_indent += 1
- elif isinstance(msg, StartPackageMessage):
- show("Downloading package %s" % msg.package.id)
- elif isinstance(msg, UpToDateMessage):
- show("Package %s is up-to-date!" % msg.package.id)
- # elif isinstance(msg, StaleMessage):
- # show('Package %s is out-of-date or corrupt' % msg.package.id)
- elif isinstance(msg, FinishDownloadMessage):
- show("Finished downloading %r." % msg.package.id)
- elif isinstance(msg, StartUnzipMessage):
- show("Unzipping %s" % msg.package.filename)
- elif isinstance(msg, FinishCollectionMessage):
- self._log_indent -= 1
- show("Finished downloading collection %r." % msg.collection.id)
- self._clear_mark(msg.collection.id)
- elif isinstance(msg, FinishPackageMessage):
- self._clear_mark(msg.package.id)
- afterid = self.top.after(self._DL_DELAY, self._download_cb, download_iter, ids)
- self._afterid["_download_cb"] = afterid
def _select(self, id):
    """Select the first table row whose "Identifier" equals ``id``, if any."""
    table = self._table
    matching_rows = (
        index for index in range(len(table)) if table[index, "Identifier"] == id
    )
    for index in matching_rows:
        table.select(index)
        return
def _color_table(self):
    """
    Color every row according to its "Status" value (looked up in
    ``self._ROW_COLOR``) and give the first column of each row the
    ``self._MARK_COLOR`` foreground/background.
    """
    table = self._table
    for index in range(len(table)):
        status = table[index, "Status"]
        background, select_background = self._ROW_COLOR[status]
        table.rowconfig(
            index,
            foreground="black",
            selectforeground="white",
            background=background,
            selectbackground=select_background,
        )
        # The mark column gets its own colors, regardless of status.
        mark_colors = self._MARK_COLOR
        table.itemconfigure(
            index, 0, foreground=mark_colors[0], background=mark_colors[1]
        )
def _clear_mark(self, id):
    """Blank the mark column of every row whose "Identifier" equals ``id``."""
    table = self._table
    for index in range(len(table)):
        if table[index, "Identifier"] != id:
            continue
        table[index, 0] = ""
def _mark_all(self, *e):
    """Put an "X" in the mark column of every row of the table."""
    table = self._table
    row_count = len(table)
    index = 0
    while index < row_count:
        table[index, 0] = "X"
        index += 1
def _table_mark(self, *e):
    """
    Toggle the mark ("X" / empty) on the currently selected row, then
    move the selection down by one row.

    ``selected_row()`` may report that nothing is selected by returning
    ``None`` (the download methods of this class check for exactly
    that); in that case no mark is toggled instead of failing on the
    ``None >= 0`` comparison.
    """
    selection = self._table.selected_row()
    if selection is not None and selection >= 0:
        if self._table[selection][0] != "":
            self._table[selection, 0] = ""
        else:
            self._table[selection, 0] = "X"
    self._table.select(delta=1)
def _show_log(self):
    """Open a text window showing all logged messages, one per line."""
    log_text = "\n".join(self._log_messages)
    ShowText(self.top, "NLTK Downloader Log", log_text)
- def _package_to_columns(self, pkg):
- """
- Given a package, return a list of values describing that
- package, one for each column in ``self.COLUMNS``.
- """
- row = []
- for column_index, column_name in enumerate(self.COLUMNS):
- if column_index == 0: # Mark:
- row.append("")
- elif column_name == "Identifier":
- row.append(pkg.id)
- elif column_name == "Status":
- row.append(self._ds.status(pkg))
- else:
- attr = column_name.lower().replace(" ", "_")
- row.append(getattr(pkg, attr, "n/a"))
- return row
- # /////////////////////////////////////////////////////////////////
- # External Interface
- # /////////////////////////////////////////////////////////////////
def destroy(self, *e):
    """Destroy the top-level window; later calls do nothing."""
    if not self._destroyed:
        self.top.destroy()
        self._destroyed = True
def _destroy(self, *e):
    """
    Clean up: cancel every pending ``after`` callback recorded in
    ``self._afterid``, abort a threaded download that is still running,
    and drop the column variables.
    """
    window = self.top
    if window is not None:
        for pending in self._afterid.values():
            window.after_cancel(pending)

    # Abort any download in progress.
    if self._downloading and self._use_threads:
        self._abort_download()

    # Make sure the garbage collector destroys these now;
    # otherwise, they may get destroyed when we're not in the main
    # thread, which would make Tkinter unhappy.
    self._column_vars.clear()
def mainloop(self, *args, **kwargs):
    """Run the top-level window's ``mainloop``, forwarding all arguments."""
    window = self.top
    window.mainloop(*args, **kwargs)
- # /////////////////////////////////////////////////////////////////
- # HELP
- # /////////////////////////////////////////////////////////////////
# Text shown by ``help()``.  ``textwrap.dedent`` removes the common
# leading indentation, so the text itself starts in column 0.
HELP = textwrap.dedent(
    """\
    This tool can be used to download a variety of corpora and models
    that can be used with NLTK. Each corpus or model is distributed
    in a single zip file, known as a \"package file.\" You can
    download packages individually, or you can download pre-defined
    collections of packages.
    When you download a package, it will be saved to the \"download
    directory.\" A default download directory is chosen when you run
    the downloader; but you may also select a different download
    directory. On Windows, the default download directory is
    \"package.\"
    The NLTK downloader can be used to download a variety of corpora,
    models, and other data packages.
    Keyboard shortcuts::
    [return]\t Download
    [up]\t Select previous package
    [down]\t Select next package
    [left]\t Select previous tab
    [right]\t Select next tab
    """
)


def help(self, *e):
    """
    Show the help text in its own window.

    The 'fixed' font is tried first because the default font is not
    very legible; if that attempt fails for any reason, the window is
    shown again with the default font.
    """
    title = "Help: NLTK Downloader"
    text = self.HELP.strip()
    try:
        ShowText(self.top, title, text, width=75, font="fixed")
    except Exception:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt
        # and SystemExit are no longer swallowed here.
        ShowText(self.top, title, text, width=75)
def about(self, *e):
    """
    Show the "about" information, using a tkinter message box when
    ``tkinter.messagebox`` can be imported and a plain text window
    otherwise.
    """
    title = "About: NLTK Downloader"
    body = "NLTK Downloader\n" + "Written by Edward Loper"
    try:
        from tkinter.messagebox import Message

        box = Message(message=body, title=title)
        box.show()
    except ImportError:
        ShowText(self.top, title, body)
- # /////////////////////////////////////////////////////////////////
- # Progress Bar
- # /////////////////////////////////////////////////////////////////
# Line width of each stripe of the animated gradient.
_gradient_width = 5


def _init_progressbar(self):
    """
    Create the canvas items of the progress bar: a set of slanted
    lines in alternating shades of red (tagged "gradient", initially
    hidden) and the rectangle tagged "redbox" that displays progress.
    """
    canvas = self._progressbar
    bar_width = int(canvas["width"])
    bar_height = int(canvas["height"])
    stripe = self._gradient_width

    for index in range(0, (bar_width * 2) // stripe):
        offset = index * stripe
        shade = 80 + abs(index % 6 - 3) * 12
        canvas.create_line(
            offset + 20,
            -20,
            offset - bar_height - 20,
            bar_height + 20,
            width=stripe,
            fill="#%02x0000" % shade,
        )
    canvas.addtag_all("gradient")
    canvas.itemconfig("gradient", state="hidden")

    # This is used to display progress
    redbox = canvas.create_rectangle(0, 0, 0, 0, fill=self._PROGRESS_COLOR[0])
    canvas.addtag_withtag("redbox", redbox)
def _show_progress(self, percent):
    """
    Resize the "redbox" item to reflect ``percent`` of the canvas
    width.  With ``percent=None`` the box is collapsed to nothing and
    the "gradient" items are hidden.
    """
    canvas = self._progressbar
    if percent is not None:
        bar_width = int(canvas["width"])
        bar_height = int(canvas["height"])
        right_edge = percent * bar_width // 100 + 1
        canvas.coords("redbox", 0, 0, right_edge, bar_height + 1)
        return
    canvas.coords("redbox", 0, 0, 0, 0)
    canvas.itemconfig("gradient", state="hidden")
def _progress_alive(self):
    """
    Animate the "gradient" items while a download is running, and
    reschedule this method; hide them (and stop rescheduling) once
    ``self._downloading`` is false.
    """
    canvas = self._progressbar
    if not self._downloading:
        canvas.itemconfig("gradient", state="hidden")
        return

    canvas.itemconfig("gradient", state="normal")
    left, _top, _right, _bottom = canvas.bbox("gradient")
    # Slide left by 4 each tick; once far enough left, jump back right
    # by one full shade cycle (6 stripes) so the motion looks endless.
    shift = (self._gradient_width * 6) - 4 if left <= -100 else -4
    canvas.move("gradient", shift, 0)
    self._afterid["_progress_alive"] = self.top.after(200, self._progress_alive)
- # /////////////////////////////////////////////////////////////////
- # Threaded downloader
- # /////////////////////////////////////////////////////////////////
def _download_threaded(self, *e):
    """
    Download the marked packages (or the selected one when nothing is
    marked) in a separate ``_DownloadThread``.

    Calling this while a download is already running aborts that
    download instead of starting a new one.
    """
    # A second click while downloading means "cancel".
    if self._downloading:
        self._abort_download()
        return

    # Change the 'download' button to an 'abort' button.
    self._download_button["text"] = "Cancel"

    table = self._table
    marked = [
        table[index, "Identifier"]
        for index in range(len(table))
        if table[index, 0] != ""
    ]
    current = table.selected_row()
    if current is not None and not marked:
        marked = [table[current, "Identifier"]]

    # Create a new data server object for the download operation,
    # just in case the user modifies our data server during the
    # download (e.g., clicking 'refresh' or editing the index url).
    server = Downloader(self._ds.url, self._ds.download_dir)

    # Start downloading in a separate thread.
    assert self._download_msg_queue == []
    assert self._download_abort_queue == []
    worker = self._DownloadThread(
        server,
        marked,
        self._download_lock,
        self._download_msg_queue,
        self._download_abort_queue,
    )
    worker.start()

    # Monitor the download message queue & display its progress.
    self._log_indent = 0
    self._downloading = True
    self._monitor_message_queue()

    # Display an indication that we're still alive and well by
    # cycling the progress bar.
    self._progress_alive()
- def _abort_download(self):
- if self._downloading:
- self._download_lock.acquire()
- self._download_abort_queue.append("abort")
- self._download_lock.release()
- class _DownloadThread(threading.Thread):
- def __init__(self, data_server, items, lock, message_queue, abort):
- self.data_server = data_server
- self.items = items
- self.lock = lock
- self.message_queue = message_queue
- self.abort = abort
- threading.Thread.__init__(self)
- def run(self):
- for msg in self.data_server.incr_download(self.items):
- self.lock.acquire()
- self.message_queue.append(msg)
- # Check if we've been told to kill ourselves:
- if self.abort:
- self.message_queue.append("aborted")
- self.lock.release()
- return
- self.lock.release()
- self.lock.acquire()
- self.message_queue.append("finished")
- self.lock.release()
# Delay handed to ``self.top.after`` between two polls of the queue.
_MONITOR_QUEUE_DELAY = 100


def _monitor_message_queue(self):
    """
    Process every message the download thread has queued so far, update
    the GUI accordingly, and schedule the next poll.

    Polling stops when a ``"finished"`` or ``"aborted"`` marker is seen
    (the queues are cleared and the button is reset) or when an
    ``ErrorMessage`` is seen.  The lock is held only while the queue is
    being processed and is released on every path, including the error
    path.
    """

    def show(s):
        self._progresslabel["text"] = s
        self._log(s)

    # What ended the polling, if anything: "finished", "aborted", "error".
    outcome = None

    # A blocking acquire, exactly as before (the old ``if not acquire()``
    # test could never be true).
    with self._download_lock:
        for msg in self._download_msg_queue:
            # Done downloading?
            if msg == "finished" or msg == "aborted":
                # self._fill_table(sort=False)
                self._update_table_status()
                self._downloading = False
                self._download_button["text"] = "Download"
                del self._download_msg_queue[:]
                del self._download_abort_queue[:]
                outcome = msg
                break

            # All other messages
            elif isinstance(msg, ProgressMessage):
                self._show_progress(msg.progress)
            elif isinstance(msg, ErrorMessage):
                show(msg.message)
                if msg.package is not None:
                    self._select(msg.package.id)
                self._show_progress(None)
                self._downloading = False
                # NOTE(review): as before, the queues are not cleared and
                # the button text is not reset on this path.
                outcome = "error"
                break  # halt progress.
            elif isinstance(msg, StartCollectionMessage):
                show("Downloading collection %r" % msg.collection.id)
                self._log_indent += 1
            elif isinstance(msg, StartPackageMessage):
                self._ds.clear_status_cache(msg.package.id)
                show("Downloading package %r" % msg.package.id)
            elif isinstance(msg, UpToDateMessage):
                show("Package %s is up-to-date!" % msg.package.id)
            elif isinstance(msg, FinishDownloadMessage):
                show("Finished downloading %r." % msg.package.id)
            elif isinstance(msg, StartUnzipMessage):
                show("Unzipping %s" % msg.package.filename)
            elif isinstance(msg, FinishUnzipMessage):
                show("Finished installing %s" % msg.package.id)
            elif isinstance(msg, FinishCollectionMessage):
                self._log_indent -= 1
                show("Finished downloading collection %r." % msg.collection.id)
                self._clear_mark(msg.collection.id)
            elif isinstance(msg, FinishPackageMessage):
                self._update_table_status()
                self._clear_mark(msg.package.id)
        else:
            # Let the user know when we're aborting a download (but
            # waiting for a good point to abort it, so we don't end up
            # with a partially unzipped package or anything like that).
            if self._download_abort_queue:
                self._progresslabel["text"] = "Aborting download..."

            # Everything queued so far has been handled.
            del self._download_msg_queue[:]

    if outcome == "aborted":
        show("Download aborted!")
        self._show_progress(None)
    elif outcome == "finished":
        afterid = self.top.after(100, self._show_progress, None)
        self._afterid["_monitor_message_queue"] = afterid
    elif outcome is None:
        # Check the queue again after MONITOR_QUEUE_DELAY msec.
        afterid = self.top.after(
            self._MONITOR_QUEUE_DELAY, self._monitor_message_queue
        )
        self._afterid["_monitor_message_queue"] = afterid
- ######################################################################
- # Helper Functions
- ######################################################################
- # [xx] It may make sense to move these to nltk.internals.
def md5_hexdigest(file):
    """
    Calculate and return the MD5 checksum for a given file.

    ``file`` may either be a filename (``str``, ``bytes`` or any
    ``os.PathLike`` object such as ``pathlib.Path``) or an open binary
    stream.  A filename is opened in binary mode and closed again; a
    stream is read from its current position and left open.

    :return: the checksum as a hexadecimal string.
    """
    if isinstance(file, (str, bytes, os.PathLike)):
        with open(file, "rb") as infile:
            return _md5_hexdigest(infile)
    return _md5_hexdigest(file)


def _md5_hexdigest(fp):
    """Return the MD5 hex digest of everything readable from ``fp``."""
    md5_digest = md5()
    while True:
        block = fp.read(1024 * 16)  # 16k blocks
        if not block:
            break
        md5_digest.update(block)
    return md5_digest.hexdigest()
- # change this to periodically yield progress messages?
- # [xx] get rid of topdir parameter -- we should be checking
- # this when we build the index, anyway.
def unzip(filename, root, verbose=True):
    """
    Extract the contents of the zip file ``filename`` into the
    directory ``root``.

    :raise Exception: wrapping the first ``ErrorMessage`` reported
        while unzipping.
    """
    progress = _unzip_iter(filename, root, verbose)
    for report in progress:
        if not isinstance(report, ErrorMessage):
            continue
        raise Exception(report)
- def _unzip_iter(filename, root, verbose=True):
- if verbose:
- sys.stdout.write("Unzipping %s" % os.path.split(filename)[1])
- sys.stdout.flush()
- try:
- zf = zipfile.ZipFile(filename)
- except zipfile.error as e:
- yield ErrorMessage(filename, "Error with downloaded zip file")
- return
- except Exception as e:
- yield ErrorMessage(filename, e)
- return
- zf.extractall(root)
- if verbose:
- print()
- ######################################################################
- # Index Builder
- ######################################################################
- # This may move to a different file sometime.
def build_index(root, base_url):
    """
    Create a new data.xml index file, by combining the xml description
    files for various packages and collections.  ``root`` should be the
    path to a directory containing the package xml and zip files; and
    the collection xml files.  The ``root`` directory is expected to
    have the following subdirectories::

      root/
        packages/ .................. subdirectory for packages
          corpora/ ................. zip & xml files for corpora
          grammars/ ................ zip & xml files for grammars
          taggers/ ................. zip & xml files for taggers
          tokenizers/ .............. zip & xml files for tokenizers
          etc.
        collections/ ............... xml files for collections

    For each package, there should be two files: ``package.zip``
    (where *package* is the package name)
    which contains the package itself as a compressed zip file; and
    ``package.xml``, which is an xml description of the package.  The
    zipfile ``package.zip`` should expand to a single subdirectory
    named ``package/``.  The base filename ``package`` must match
    the identifier given in the package's xml file.

    For each collection, there should be a single file ``collection.xml``
    describing the collection, where *collection* is the name of the collection.

    All identifiers (for both packages and collections) must be unique.

    :param root: directory holding ``packages/`` and ``collections/``.
    :param base_url: prefix used to build each package's default url.
    :return: the top-level ``nltk_data`` element of the index.
    :raise ValueError: if two items share the same identifier.
    """
    # Find all packages.
    packages = []
    for pkg_xml, zf, subdir in _find_packages(os.path.join(root, "packages")):
        # The open archive is only needed for the computed fields below;
        # close it as soon as they have been recorded.
        with zf:
            zipstat = os.stat(zf.filename)
            url = "%s/%s/%s" % (base_url, subdir, os.path.split(zf.filename)[1])
            unzipped_size = sum(zf_info.file_size for zf_info in zf.infolist())

            # Fill in several fields of the package xml with calculated values.
            pkg_xml.set("unzipped_size", "%s" % unzipped_size)
            pkg_xml.set("size", "%s" % zipstat.st_size)
            pkg_xml.set("checksum", "%s" % md5_hexdigest(zf.filename))
            pkg_xml.set("subdir", subdir)
            # An explicit url in the package xml takes precedence.
            if not pkg_xml.get("url"):
                pkg_xml.set("url", url)

        # Record the package.
        packages.append(pkg_xml)

    # Find all collections
    collections = list(_find_collections(os.path.join(root, "collections")))

    # Check that all UIDs are unique
    uids = set()
    for item in packages + collections:
        uid = item.get("id")
        if uid in uids:
            raise ValueError("Duplicate UID: %s" % uid)
        uids.add(uid)

    # Put it all together
    top_elt = ElementTree.Element("nltk_data")
    top_elt.append(ElementTree.Element("packages"))
    for package in packages:
        top_elt[0].append(package)
    top_elt.append(ElementTree.Element("collections"))
    for collection in collections:
        top_elt[1].append(collection)

    _indent_xml(top_elt)

    return top_elt
- def _indent_xml(xml, prefix=""):
- """
- Helper for ``build_index()``: Given an XML ``ElementTree``, modify it
- (and its descendents) ``text`` and ``tail`` attributes to generate
- an indented tree, where each nested element is indented by 2
- spaces with respect to its parent.
- """
- if len(xml) > 0:
- xml.text = (xml.text or "").strip() + "\n" + prefix + " "
- for child in xml:
- _indent_xml(child, prefix + " ")
- for child in xml[:-1]:
- child.tail = (child.tail or "").strip() + "\n" + prefix + " "
- xml[-1].tail = (xml[-1].tail or "").strip() + "\n" + prefix
- def _check_package(pkg_xml, zipfilename, zf):
- """
- Helper for ``build_index()``: Perform some checks to make sure that
- the given package is consistent.
- """
- # The filename must patch the id given in the XML file.
- uid = os.path.splitext(os.path.split(zipfilename)[1])[0]
- if pkg_xml.get("id") != uid:
- raise ValueError(
- "package identifier mismatch (%s vs %s)" % (pkg_xml.get("id"), uid)
- )
- # Zip file must expand to a subdir whose name matches uid.
- if sum((name != uid and not name.startswith(uid + "/")) for name in zf.namelist()):
- raise ValueError(
- "Zipfile %s.zip does not expand to a single "
- "subdirectory %s/" % (uid, uid)
- )
- # update for git?
def _svn_revision(filename):
    """
    Helper for ``build_index()``: Calculate the subversion revision
    number for a given file (by using ``subprocess`` to run ``svn``).

    :return: the third whitespace-separated field of the output of
        ``svn status -v``, as read from the pipe (the pipe is opened
        in binary mode, so this is a ``bytes`` object).
    :raise ValueError: if ``svn`` exits with a non-zero status, writes
        anything to stderr, or produces no output.
    """
    p = subprocess.Popen(
        ["svn", "status", "-v", filename],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    (stdout, stderr) = p.communicate()
    if p.returncode != 0 or stderr or not stdout:
        # ``textwrap.fill`` needs text, but the pipe delivers bytes.
        if isinstance(stderr, bytes):
            stderr = stderr.decode(errors="replace")
        raise ValueError(
            "Error determining svn_revision for %s: %s"
            % (os.path.split(filename)[1], textwrap.fill(stderr))
        )
    return stdout.split()[2]
- def _find_collections(root):
- """
- Helper for ``build_index()``: Yield a list of ElementTree.Element
- objects, each holding the xml for a single package collection.
- """
- packages = []
- for dirname, subdirs, files in os.walk(root):
- for filename in files:
- if filename.endswith(".xml"):
- xmlfile = os.path.join(dirname, filename)
- yield ElementTree.parse(xmlfile).getroot()
def _find_packages(root):
    """
    Helper for ``build_index()``: Yield a list of tuples
    ``(pkg_xml, zf, subdir)``, where:

      - ``pkg_xml`` is an ``ElementTree.Element`` holding the xml for a
        package
      - ``zf`` is a ``zipfile.ZipFile`` for the package's contents.
        It is yielded open; the consumer is responsible for closing it.
      - ``subdir`` is the subdirectory (relative to ``root``) where
        the package was found (e.g. 'corpora' or 'grammars').

    :raise ValueError: if a zip or xml file cannot be read, or if the
        package fails the consistency checks of ``_check_package``.
    """
    from nltk.corpus.reader.util import _path_from

    # Find all packages.
    for dirname, subdirs, files in os.walk(root):
        relpath = "/".join(_path_from(root, dirname))
        for filename in files:
            if not filename.endswith(".xml"):
                continue
            xmlfilename = os.path.join(dirname, filename)
            zipfilename = xmlfilename[:-4] + ".zip"
            try:
                zf = zipfile.ZipFile(zipfilename)
            except Exception as e:
                raise ValueError(
                    "Error reading file %r!\n%s" % (zipfilename, e)
                ) from e
            try:
                try:
                    pkg_xml = ElementTree.parse(xmlfilename).getroot()
                except Exception as e:
                    raise ValueError(
                        "Error reading file %r!\n%s" % (xmlfilename, e)
                    ) from e
                # Check that the UID matches the filename, and that the
                # zipfile expands to a subdir whose name matches the uid.
                _check_package(pkg_xml, zipfilename, zf)
            except Exception:
                # Don't leak the open archive when the package is rejected.
                zf.close()
                raise

            yield pkg_xml, zf, relpath

        # Don't recurse into svn subdirectories:
        if ".svn" in subdirs:
            subdirs.remove(".svn")
- ######################################################################
- # Main:
- ######################################################################
- # There should be a command-line interface
- # Aliases
# Module-level ``Downloader`` instance shared by the convenience
# functions below; ``download`` is its bound ``download`` method.
_downloader = Downloader()
download = _downloader.download


def download_shell():
    """Wrap the shared downloader in a ``DownloaderShell`` and run it."""
    shell = DownloaderShell(_downloader)
    shell.run()


def download_gui():
    """Wrap the shared downloader in a ``DownloaderGUI`` and enter its mainloop."""
    gui = DownloaderGUI(_downloader)
    gui.mainloop()


def update():
    """Call ``update()`` on the shared downloader."""
    _downloader.update()
# Command-line entry point: download the packages named on the command
# line, or call ``download()`` without a package id when none are given.
if __name__ == "__main__":
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option(
        "-d",
        "--dir",
        dest="dir",
        help="download package to directory DIR",
        metavar="DIR",
    )
    parser.add_option(
        "-q",
        "--quiet",
        dest="quiet",
        action="store_true",
        default=False,
        help="work quietly",
    )
    parser.add_option(
        "-f",
        "--force",
        dest="force",
        action="store_true",
        default=False,
        help="download even if already installed",
    )
    parser.add_option(
        "-e",
        "--exit-on-error",
        dest="halt_on_error",
        action="store_true",
        default=False,
        help="exit if an error occurs",
    )
    parser.add_option(
        "-u",
        "--url",
        dest="server_index_url",
        default=os.environ.get("NLTK_DOWNLOAD_URL"),
        help="download server index url",
    )

    (options, args) = parser.parse_args()

    downloader = Downloader(server_index_url=options.server_index_url)

    if args:
        for pkg_id in args:
            rv = downloader.download(
                info_or_id=pkg_id,
                download_dir=options.dir,
                quiet=options.quiet,
                force=options.force,
                halt_on_error=options.halt_on_error,
            )
            # Stop at the first failed package when asked to.
            if rv is False and options.halt_on_error:
                break
    else:
        downloader.download(
            download_dir=options.dir,
            quiet=options.quiet,
            force=options.force,
            halt_on_error=options.halt_on_error,
        )
|