Source code for overview_upload._upload

import hashlib
import io
import json
import logging
import os
import pathlib
import requests
import rfc6266
import uuid

def _calculate_sha1(in_file):
    m = hashlib.sha1()
    for chunk in iter(lambda: in_file.read(8192), b''):
        m.update(chunk)
    return m.hexdigest()

class Upload:
    """Start an Upload session.

    :param str server_url: Website to upload to. For example:
        ``https://www.overviewdocs.com``
    :param str api_token: String from
        https://www.overviewdocs.com/documentsets/XXXX/api-tokens
    :param logging.Logger logger: Logger to report progress to. If ``None``
        (the default), a logger named ``<module>.Upload`` is used.
    """

    # Filename extensions (lower-case) we know Overview doesn't handle (yet).
    _UNHANDLED_EXTENSIONS = frozenset([
        '.zip', '.msg', '.gif', '.jpg', '.png', '.tiff', '.tif', '.dbf',
    ])

    def __init__(self, server_url, api_token, logger=None):
        if logger is None:
            logger = logging.getLogger('{}.Upload'.format(__name__))

        self.server_url = server_url
        self.api_token = api_token
        self.logger = logger
        self.n_uploaded = 0  # files sent since this session was created

    def _request(self, method, path, **kwargs):
        """Send an authenticated HTTP request to the server.

        :param str method: HTTP method, e.g. ``'POST'``.
        :param str path: Path (starting with ``/``) appended to ``server_url``.
        :param kwargs: Passed through to ``requests.request()``. Any
            ``headers`` are merged over our defaults; any other key overrides
            our default of the same name.
        :return: the ``requests`` response object.
        """
        # Tolerate a trailing slash on server_url: "https://host/" + "/api"
        # must not become "https://host//api".
        url = '{}{}'.format(self.server_url.rstrip('/'), path)
        self.logger.debug('%s %s', method, url)

        # We need a "nested" dict.update(), for the headers argument. If the
        # caller specifies headers, we want them to override our defaults.
        request_headers = {
            'X-Requested-With': 'overview_upload',
        }
        if 'headers' in kwargs:
            request_headers.update(kwargs['headers'])

        request_kwargs = {
            'auth': (self.api_token, 'x-auth-token'),
        }
        request_kwargs.update(kwargs)
        request_kwargs['headers'] = request_headers

        return requests.request(method, url, **request_kwargs)

    def _should_skip_extension(self, filename):
        """Return True (and log why) if ``filename`` has a blacklisted extension."""
        _, ext = os.path.splitext(filename)
        if ext.lower() in self._UNHANDLED_EXTENSIONS:
            self.logger.info('Skipping %s, Overview does not handle this format', filename)
            return True
        return False

    def clear_previous_upload(self):
        """Remove any previously uploaded files from the server.

        If you *don't* call this, then when you ``finish()`` you may find
        Overview adds files you uploaded sometime in the past and then forgot
        about.

        Raises whatever ``raise_for_status()`` raises on an HTTP error
        response.
        """
        self.logger.info('Clearing previous uploads…')
        r = self._request('DELETE', '/api/v1/files')
        r.raise_for_status()

    def send_directory(self, dirname, skip_unhandled_extension=True, skip_duplicate=True, metadata=None):
        """Upload all files in a directory to the Overview server.

        The directory is walked recursively. Hidden files, and files inside
        hidden directories, are never uploaded. Each file is streamed from
        disk; with ``skip_duplicate == True`` it is read twice (once to hash
        it, once to send it).

        :param str dirname: Directory to upload.
        :param bool skip_unhandled_extension: if ``True`` (the default), do not
            upload files when Overview doesn't support their filename extensions
            (for instance, ``".dbf"``).
        :param bool skip_duplicate: if ``True`` (the default), do not upload a
            file if ``api_token`` points to a document set that already contains a
            file whose sha1 hash is identical to this file's. Files that have been
            sent without a call to ``finish()`` will not be included in the check.
        :param dict metadata: Metadata to set on every document, or ``None``.
            The document set should have a metadata schema that corresponds to
            this document's metadata (or you can set the schema later).
        """
        root = pathlib.Path(dirname)
        for path in root.glob('**/*'):
            relative = path.relative_to(root)

            # Don't upload hidden files (e.g., ".DS_Store" on Mac OS) or
            # anything below a hidden directory.
            if any(part.startswith('.') for part in relative.parts):
                continue

            if path.is_file():
                self.send_path_if_conditions_met(
                    path,
                    str(relative),  # visible on the server
                    skip_unhandled_extension=skip_unhandled_extension,
                    skip_duplicate=skip_duplicate,
                    metadata=metadata
                )

    def send_path_if_conditions_met(self, path, filename, skip_unhandled_extension=True, skip_duplicate=True, metadata=None):
        """Upload the file at the specified Path to the Overview server.

        The file will be streamed: that is, the script does not risk running out
        of memory.

        :param pathlib.Path path: absolute or relative pathlib.Path pointing
            to the document.
        :param str filename: filename Overview should use.
        :param bool skip_unhandled_extension: -- if ``True`` (the default), do
            not upload this file if Overview doesn't support its filename
            extension (for instance, ``".dbf"``).
        :param bool skip_duplicate: if ``True`` (the default), do not upload
            this file if your api_token points to a document set that already
            contains a file whose sha1 hash is identical to this file's. Files
            that have been sent but not finish()ed will not be included in the
            check.
        :param dict metadata: Metadata to set on the document, or ``None``.
            The document set should have a metadata schema that corresponds to
            this document's metadata (or you can set the schema later).
        """
        n_bytes = path.stat().st_size

        # Decide on the extension *before* hashing: there is no point reading
        # a whole (possibly huge) file only to skip it afterwards.
        if skip_unhandled_extension and self._should_skip_extension(filename):
            return

        sha1 = None
        if skip_duplicate:
            # we need to stream the file to calculate sha1, and then we need
            # to stream it again to send it to the server. Do that by
            # opening the file twice: the alternative is to read the entire
            # file into memory, which can be huge.
            with path.open('rb') as in_file:
                sha1 = _calculate_sha1(in_file)

        with path.open('rb', buffering=8192) as in_file:
            self.send_file_if_conditions_met(
                in_file,
                filename,
                n_bytes=n_bytes,
                skip_unhandled_extension=skip_unhandled_extension,
                skip_duplicate=skip_duplicate,
                metadata=metadata,
                sha1=sha1
            )

    def send_file_if_conditions_met(self, in_file, filename, n_bytes=None, skip_unhandled_extension=True, skip_duplicate=True, metadata=None, sha1=None):
        """Upload a file to the Overview server.

        If ``n_bytes is None or (skip_duplicate == True and sha1 is None)``,
        then ``in_file`` will be cached in memory. Otherwise, it will be
        streamed to the server, saving memory.

        :param io.BytesIO in_file: BytesIO containing the document.
        :param str filename: Filename to set in Overview.
        :param int n_bytes: Exact file size (or `None` to auto-calculate).
            Supply this and ``sha1`` (if applicable) to stream ``in_file`` to
            the server instead of caching it in memory.
        :param bool skip_unhandled_extension: if ``True`` (the default), do not
            upload this file if Overview doesn't support its filename extension
            (for instance, ``".dbf"``).
        :param bool skip_duplicate: if ``True`` (the default), do not upload
            this file if your api_token points to a document set that already
            contains a file whose sha1 hash is identical to this file's. Files
            that have been sent without a call to ``finish()`` will not be
            included in the check.
        :param dict metadata: Metadata to set on the document, or ``None``.
            The document set should have a metadata schema that corresponds to
            this document's metadata (or you can set the schema later).
        :param str sha1: SHA1 hash to use in the ``skip_duplicate`` check, or
            ``None`` to calculate on the fly. If you set this and ``n_bytes``,
            this method will stream the file contents instead of caching them
            in memory.
        """
        if skip_unhandled_extension and self._should_skip_extension(filename):
            return

        # Set once in_file has been copied into memory, so that we never copy
        # the same payload twice.
        cached_size = None

        if skip_duplicate:
            if sha1 is None:
                # Cache in_file bytes in memory so we can use them twice: once
                # to compute the hash, and once to upload below.
                contents = in_file.read()
                sha1 = hashlib.sha1(contents).hexdigest()
                in_file = io.BytesIO(contents)
                cached_size = len(contents)

            if self.is_file_already_in_document_set(in_file, sha1):
                self.logger.info('Skipping %s, already on server', filename)
                return

        if n_bytes is None:
            if cached_size is None:
                # Cache in_file bytes in memory so we can learn their length
                # before sending them.
                contents = in_file.read()
                in_file = io.BytesIO(contents)
                cached_size = len(contents)
            n_bytes = cached_size

        server_path = '/api/v1/files/{}'.format(uuid.uuid4())
        headers = {
            'Content-Disposition': rfc6266.build_header(filename),
            'Content-Length': str(n_bytes),
        }
        if metadata:
            headers['Overview-Document-Metadata-JSON'] = json.dumps(metadata, ensure_ascii=True)

        self.logger.info('Uploading %s…', filename)
        r = self._request('POST', server_path, headers=headers, data=in_file)
        r.raise_for_status()
        self.n_uploaded += 1

    def is_file_already_in_document_set(self, in_file, sha1=None):
        """Return True iff the document set contains an identical file.

        This works by calculating the SHA1 and asking Overview whether it's been
        seen before in our document set. Files sent without a call to
        ``finish()`` will not be included in this check.

        :param io.BytesIO in_file: bytes to upload to Overview.
        :param str sha1: if set, assume the given SHA1 hash instead of computing
            it by reading the file. If ``None`` (the default), then ``in_file``
            will be read completely.
        :return: ``True`` on HTTP 204, ``False`` on HTTP 404. An HTTP error
            status raises; any other status is logged and treated as
            "not a duplicate".
        :rtype: bool
        """
        if sha1 is None:
            sha1 = _calculate_sha1(in_file)

        r = self._request('HEAD', '/api/v1/document-sets/files/{}'.format(sha1))

        if r.status_code == 204:
            return True
        if r.status_code == 404:
            return False

        r.raise_for_status()
        # Not an error status, but not an answer we understand either. Err on
        # the side of uploading rather than silently returning None.
        self.logger.warning(
            'Unexpected HTTP status %d while checking for duplicates; assuming file is new',
            r.status_code
        )
        return False

    def finish(self, lang='en', ocr=True, split_by_page=False):
        """Adds sent files to the document set.

        Does nothing (besides logging) if this session has not uploaded any
        files.

        :param str lang: ISO language code for Overview's analysis (default is
            ``"en"``)
        :param bool ocr: if ``True`` (the default), tell Overview to read text
            from PDF pages that contain only images.
        :param bool split_by_page: if ``True``, tell Overview to create a
            document per page of the input file. (This only applies to PDFs and
            LibreOffice-compatible documents.) If ``False`` (the default), tell
            Overview to create one document per uploaded file.
        """
        if self.n_uploaded == 0:
            self.logger.info('No files uploaded')
            return

        # http://docs.overviewproject.apiary.io/#reference/files/finish-uploading-files/add-files-to-document-set?console=1
        self.logger.info('Finishing…')
        r = self._request('POST', '/api/v1/files/finish', json={
            'lang': lang,
            'ocr': ocr,
            'split_documents': split_by_page,
        })
        r.raise_for_status()
        self.logger.info(
            'Finished uploading %d file(s). Browse to %s/documentsets to watch progress',
            self.n_uploaded,
            self.server_url
        )
Source code for overview_upload._upload

overview-upload

Navigation

Related Topics