Thought I'd give my 2 cents on this even if a bit late to the party.
nltk exposes two entry points for this: the download() function and the Downloader class (in nltk.downloader).
download() already contains logic that checks if the package is downloaded & up to date:
from pathlib import Path
from nltk import download as nltk_download
from typing import List, Any
from nltk.downloader import Downloader
import logging
def download_nltk_data(
    list_of_resources: List[str],
    download_dir: Path,
    *,
    quiet: bool = True,
) -> None:
    """Download each listed nltk resource into ``download_dir``.

    Every resource id is handed to ``nltk.download`` in turn, in list order.

    Args:
        list_of_resources: nltk package identifiers, e.g. ``'stopwords'``.
        download_dir: Directory passed to nltk as the download target.
        quiet: Forwarded to ``nltk.download``. ``True`` (the default)
            suppresses nltk's per-package progress messages; pass ``False``
            to see them.
    """
    # Pass the directory as a plain string, converting once outside the loop.
    target_dir = str(download_dir)
    for resource in list_of_resources:
        succeeded = nltk_download(
            info_or_id=resource,
            download_dir=target_dir,
            quiet=quiet,
        )
        # A falsy result is treated as a failed download; surface it
        # instead of silently dropping it.
        if not succeeded:
            logging.warning('Failed to download %s to %s', resource, target_dir)
# Example run: fetch the stopwords and punkt packages into ./data/nltk/.
download_nltk_data(
    download_dir=Path('./data/nltk/'),
    list_of_resources=['stopwords', 'punkt'],
)
Output:
[nltk_data] Downloading package stopwords to data\nltk...
[nltk_data] Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to data\nltk...
[nltk_data] Package punkt is already up-to-date!
The messages above are what nltk prints when quiet=False; pass quiet=True if you want to suppress them.
If for some reason you want finer control over the packages you can use the Downloader class and extend functionality:
def check_package_exists(
    package_id: Any,
    download_dir: Path,
) -> bool:
    """Return whether nltk reports ``package_id`` as installed.

    A ``Downloader`` is created with ``download_dir`` (as a string) as its
    download directory, and the result of its ``is_installed`` call for
    ``package_id`` is returned unchanged.
    """
    return Downloader(download_dir=str(download_dir)).is_installed(package_id)
def download_nltk_data(
    list_of_resources: List[str],
    download_dir: Path,
) -> None:
    """Download the listed nltk resources that are not installed yet.

    ``download_dir`` is created first (including missing parents). Each
    resource is then checked with ``Downloader.is_installed`` and only
    downloaded when that check is falsy. Progress is reported through
    ``logging.debug``; a failed download is reported through
    ``logging.warning``.

    Args:
        list_of_resources: nltk package identifiers, e.g. ``'stopwords'``.
        download_dir: Directory the resources are stored in.
    """
    download_dir.mkdir(parents=True, exist_ok=True)
    if not list_of_resources:
        # Nothing to check or fetch, so no downloader is needed.
        return

    # One downloader serves both the installed-check and the download,
    # instead of building a fresh instance for every resource.
    downloader = Downloader(download_dir=str(download_dir))
    for resource in list_of_resources:
        if downloader.is_installed(resource):
            logging.debug('%s already exists in %s', resource, download_dir)
            continue

        logging.debug('Downloading %s to %s', resource, download_dir)
        # A falsy result is treated as a failed download; surface it
        # instead of silently dropping it.
        if not downloader.download(info_or_id=resource, quiet=True):
            logging.warning('Failed to download %s to %s', resource, download_dir)
# Example run: make sure stopwords and punkt are present in ./data/nltk/.
download_nltk_data(
    download_dir=Path('./data/nltk/'),
    list_of_resources=['stopwords', 'punkt'],
)
Output:
stopwords already exists in data\nltk
punkt already exists in data\nltk
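Or something like that. Note that these messages are emitted with logging.debug, so they only appear if logging is configured to show DEBUG-level records (for example with logging.basicConfig(level=logging.DEBUG)), and the exact line format depends on your logging configuration.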