import hashlib
import io
import json
import logging
import os
import pathlib
import requests
import rfc6266
import uuid
def _calculate_sha1(in_file):
m = hashlib.sha1()
for chunk in iter(lambda: in_file.read(8192), b''):
m.update(chunk)
return m.hexdigest()
class Upload:
    """Start an Upload session.

    :param str server_url: Website to upload to. For example:
        ``https://www.overviewdocs.com``
    :param str api_token: String from
        https://www.overviewdocs.com/documentsets/XXXX/api-tokens
    :param logging.Logger logger: Logger to use, or ``None`` to create one
        named after this module.
    """

    def __init__(self, server_url, api_token, logger=None):
        if logger is None:
            logger = logging.getLogger('{}.Upload'.format(__name__))
        self.server_url = server_url
        self.api_token = api_token
        self.logger = logger
        # Number of files successfully POSTed in this session; finish() is a
        # no-op when this is still 0.
        self.n_uploaded = 0

    def _request(self, method, path, **kwargs):
        """Send an authenticated HTTP request to ``server_url + path``.

        Extra ``kwargs`` are passed through to :func:`requests.request`.
        Returns the :class:`requests.Response` (callers check status).
        """
        url = '{}{}'.format(self.server_url, path)
        self.logger.debug('%s %s', method, url)
        # We need a "nested" dict.update(), for the headers argument. If the
        # caller specifies headers, we want them to override our defaults.
        request_headers = {
            'X-Requested-With': 'overview_upload',
        }
        if 'headers' in kwargs:
            request_headers.update(kwargs['headers'])
        request_kwargs = {
            # The API token is the HTTP Basic username; the password is a
            # fixed placeholder string.
            'auth': (self.api_token, 'x-auth-token'),
        }
        request_kwargs.update(kwargs)
        request_kwargs['headers'] = request_headers
        return requests.request(method, url, **request_kwargs)

    def clear_previous_upload(self):
        """Remove any previously uploaded files from the server.

        If you *don't* call this, then when you ``finish()`` you may find
        Overview adds files you uploaded sometime in the past and then forgot
        about.
        """
        self.logger.info('Clearing previous uploads…')
        r = self._request('DELETE', '/api/v1/files')
        r.raise_for_status()

    def send_directory(self, dirname, skip_unhandled_extension=True,
                       skip_duplicate=True, metadata=None):
        """Upload all files in a directory to the Overview server.

        If ``skip_duplicate == False``, then files will be streamed to the
        server. Otherwise, this method will cache each file in memory during
        upload.

        :param str dirname: Directory to upload.
        :param bool skip_unhandled_extension: if ``True`` (the default), do not
            upload files when Overview doesn't support their filename extensions
            (for instance, ``".dbf"``).
        :param bool skip_duplicate: if ``True`` (the default), do not upload a
            file if ``api_token`` points to a document set that already contains a
            file whose sha1 hash is identical to this file's. Files that have been
            sent without a call to ``finish()`` will not be included in the check.
            If ``False``, stream files instead of caching them.
        :param dict metadata: Metadata to set on every document, or ``None``.
            The document set should have a metadata schema that corresponds to
            this document's metadata (or you can set the schema later).
        """
        for path in pathlib.Path(dirname).glob('**/*'):
            filename = str(path.relative_to(dirname))  # visible on the server
            # Don't upload hidden files (e.g., ".DS_Store" on Mac OS) — a
            # leading dot on any path component, on either separator style.
            if filename[0] == '.' or '/.' in filename or '\\.' in filename:
                continue
            if path.is_file():
                self.send_path_if_conditions_met(
                    path,
                    filename,
                    skip_unhandled_extension=skip_unhandled_extension,
                    skip_duplicate=skip_duplicate,
                    metadata=metadata
                )

    def send_path_if_conditions_met(self, path, filename,
                                    skip_unhandled_extension=True,
                                    skip_duplicate=True, metadata=None):
        """Upload the file at the specified Path to the Overview server.

        The file will be streamed: that is, the script does not risk running out
        of memory.

        :param pathlib.Path path: absolute or relative pathlib.Path pointing
            to the document.
        :param str filename: filename Overview should use.
        :param bool skip_unhandled_extension: if ``True`` (the default), do
            not upload this file if Overview doesn't support its filename
            extension (for instance, ``".dbf"``).
        :param bool skip_duplicate: if ``True`` (the default), do not upload
            this file if your api_token points to a document set that already
            contains a file whose sha1 hash is identical to this file's. Files
            that have been sent but not finish()ed will not be included in the
            check.
        :param dict metadata: Metadata to set on the document, or ``None``.
            The document set should have a metadata schema that corresponds to
            this document's metadata (or you can set the schema later).
        """
        n_bytes = path.stat().st_size
        sha1 = None
        if skip_duplicate:
            # we need to stream the file to calculate sha1, and then we need
            # to stream it again to send it to the server. Do that by
            # opening the file twice: the alternative is to read the entire
            # file into memory, which can be huge.
            with path.open('rb') as in_file:
                sha1 = _calculate_sha1(in_file)
        with path.open('rb', buffering=8192) as in_file:
            self.send_file_if_conditions_met(
                in_file,
                filename,
                n_bytes=n_bytes,
                skip_unhandled_extension=skip_unhandled_extension,
                skip_duplicate=skip_duplicate,
                metadata=metadata,
                sha1=sha1
            )

    def send_file_if_conditions_met(self, in_file, filename, n_bytes=None,
                                    skip_unhandled_extension=True,
                                    skip_duplicate=True, metadata=None,
                                    sha1=None):
        """Upload a file to the Overview server.

        If ``n_bytes is None or (skip_duplicate == True and sha1 is None)``,
        then ``in_file`` will be cached in memory. Otherwise, it will be
        streamed to the server, saving memory.

        :param io.BytesIO in_file: BytesIO containing the document.
        :param str filename: Filename to set in Overview.
        :param int n_bytes: Exact file size (or `None` to auto-calculate).
            Supply this and ``sha1`` (if applicable) to stream ``in_file`` to
            the server instead of caching it in memory.
        :param bool skip_unhandled_extension: if ``True`` (the default), do not
            upload this file if Overview doesn't support its filename extension
            (for instance, ``".dbf"``).
        :param bool skip_duplicate: if ``True`` (the default), do not upload
            this file if your api_token points to a document set that already
            contains a file whose sha1 hash is identical to this file's. Files
            that have been sent without a call to ``finish()`` will not be
            included in the check.
        :param dict metadata: Metadata to set on the document, or ``None``.
            The document set should have a metadata schema that corresponds to
            this document's metadata (or you can set the schema later).
        :param str sha1: SHA1 hash to use in the ``skip_duplicate`` check, or
            ``None`` to calculate on the fly. If you set this and ``n_bytes``,
            this method will stream the file contents instead of caching them
            in memory.
        """
        if skip_unhandled_extension:
            # We go by filename, with a blacklist we know Overview doesn't
            # handle (yet). (splitext's first element is unused.)
            _, ext = os.path.splitext(filename)
            if ext.lower() in ('.zip', '.msg', '.gif', '.jpg', '.png',
                               '.tiff', '.tif', '.dbf'):
                self.logger.info('Skipping %s, Overview does not handle this format', filename)
                return
        if skip_duplicate:
            if sha1 is None:
                # Cache in_file bytes in memory so we can read it twice: once in
                # is_file_already_in_document_set(), and once below.
                in_file = io.BytesIO(in_file.read())
                sha1 = _calculate_sha1(in_file)
                in_file.seek(0)
            if self.is_file_already_in_document_set(in_file, sha1):
                self.logger.info('Skipping %s, already on server', filename)
                return
        if n_bytes is None:
            # Cache in_file bytes in memory so we can read it twice: once
            # here, once below
            in_file = io.BytesIO(in_file.read())
            n_bytes = in_file.getbuffer().nbytes
        server_path = '/api/v1/files/{}'.format(uuid.uuid4())
        headers = {
            'Content-Disposition': rfc6266.build_header(filename),
            'Content-Length': str(n_bytes),
        }
        if metadata:
            headers['Overview-Document-Metadata-JSON'] = json.dumps(metadata, ensure_ascii=True)
        self.logger.info('Uploading %s…', filename)
        r = self._request('POST', server_path, headers=headers, data=in_file)
        r.raise_for_status()
        self.n_uploaded += 1

    def is_file_already_in_document_set(self, in_file, sha1=None):
        """Return True iff the document set contains an identical file.

        This works by calculating the SHA1 and asking Overview whether it's been
        seen before in our document set. Files sent without a call to
        ``finish()`` will not be included in this check.

        :param io.BytesIO in_file: bytes to upload to Overview.
        :param str sha1: if set, assume the given SHA1 hash instead of computing
            it by reading the file. If ``None`` (the default), then ``in_file``
            will be read completely.
        """
        if sha1 is None:
            sha1 = _calculate_sha1(in_file)
        r = self._request('HEAD', '/api/v1/document-sets/files/{}'.format(sha1))
        if r.status_code == 204:
            return True
        elif r.status_code == 404:
            return False
        else:
            # NOTE(review): raise_for_status() only raises for 4xx/5xx; an
            # unexpected 2xx/3xx status would fall through and return None
            # (falsy). The API presumably only answers 204/404 here — confirm.
            r.raise_for_status()

    def finish(self, lang='en', ocr=True, split_by_page=False):
        """Adds sent files to the document set.

        :param str lang: ISO language code for Overview's analysis (default is
            ``"en"``)
        :param bool ocr: if ``True`` (the default), tell Overview to read text
            from PDF pages that contain only images.
        :param bool split_by_page: if ``True``, tell Overview to create a
            document per page of the input file. (This only applies to PDFs and
            LibreOffice-compatible documents.) If ``False`` (the default), tell
            Overview to create one document per uploaded file.
        """
        if self.n_uploaded == 0:
            self.logger.info('No files uploaded')
            return
        # http://docs.overviewproject.apiary.io/#reference/files/finish-uploading-files/add-files-to-document-set?console=1
        self.logger.info('Finishing…')
        r = self._request('POST', '/api/v1/files/finish', json={
            'lang': lang,
            'ocr': ocr,
            'split_documents': split_by_page,
        })
        r.raise_for_status()
        self.logger.info(
            'Finished uploading %d file(s). Browse to %s/documentsets to watch progress',
            self.n_uploaded,
            self.server_url
        )