Source code for upload.views

import csv
import pandas as pd
import subprocess
import json
import io
import re

from flask import Blueprint, request, current_app as app, jsonify

bp = Blueprint('upload', __name__)


@bp.route('/upload_file/', methods=['POST'])
def upload_file():
    """
    POST /upload_file/
        Upload, validate and process a file, and send it to CompleteSearch.

    The delimiter is sniffed from the first (up to 50) non-empty,
    non-comment lines. The file is then parsed with pandas, all-empty
    columns are dropped, the result is written as a tab-separated file to
    ``OUTPUT_PATH`` and, unless ``TESTING`` is set, ``make`` is invoked to
    process the input.

    :param use_first_row: use the first data row as a header.
        If the parameter is ``False``, the column names will be generated
        automatically (i.e. Column1, Column2, etc.).

    :param file: the uploading file

    :returns: dictionary with dataset settings, e.g. facet/filter fields,
        which fields to use for the full-text search, etc. Any failure is
        reported through the ``error`` string and ``success=False`` rather
        than an HTTP error.

    :rtype: JSON response
    """
    num_cols = 40
    result = {}
    error = ''

    # Only the literal string 'true' (also the default) enables the header.
    header_row = request.form.get('use_first_row', 'true') == 'true'

    def create_header(cols):
        """Generated column names, or None to let pandas read the header."""
        if header_row:
            return None
        return ['Column' + str(i + 1) for i in range(cols)]

    def create_dataframe(file, delimiter, names):
        """Parse ``file`` into an all-object DataFrame, skipping bad rows."""
        options = dict(
            delimiter=delimiter,
            encoding='utf-8',
            engine='c',
            comment='#',
            dtype=object,
            names=names,
        )
        try:
            # ``error_bad_lines`` was removed in pandas 2.0 in favour of
            # ``on_bad_lines``.
            return pd.read_csv(file, on_bad_lines='skip', **options)
        except TypeError as exc:
            if 'on_bad_lines' not in str(exc):
                raise
            # Older pandas: fall back to the legacy keyword.
            file.seek(0)
            return pd.read_csv(file, error_bad_lines=False, **options)

    try:
        if 'file' not in request.files:
            raise ValueError('You did not select any file.')

        csv_file = request.files['file']
        if not allowed_file(csv_file.filename):
            raise ValueError('Wrong file type.')

        # Select non-empty lines to define a delimiter
        lines = []
        for line in csv_file:
            # 'utf-8-sig' drops a leading byte order mark, which would
            # otherwise end up inside the first column name.
            line = str(line, 'utf-8-sig').strip()
            if line != '' and not line.startswith('#'):
                lines.append(line)
            if len(lines) == 50:
                break
        csv_file.seek(0)

        if not lines:
            raise ValueError('Cannot define a delimiter.')

        # Define the delimiter
        dialect = csv.Sniffer().sniff('\n'.join(lines), delimiters=',;#$|\t')

        # Create a list with column names
        header = create_header(num_cols)

        csv_file_string = str(csv_file.read(), 'utf-8-sig')
        data = create_dataframe(io.StringIO(csv_file_string),
                                dialect.delimiter, header)

        # Extend the number of columns and re-create the DataFrame when no
        # generated column is completely empty (the file may be wider than
        # the generated header). With a real header row the names come
        # from the file, so parsing again would produce the same frame.
        if not header_row and not data.isnull().all().any():
            num_cols += 20
            header = create_header(num_cols)
            data = create_dataframe(io.StringIO(csv_file_string),
                                    dialect.delimiter, header)

        # Remove extra columns with all empty rows
        data = data.dropna(axis=1, how='all')

        if data.empty:
            raise ValueError('Cannot process the uploaded file. '
                             'Please make sure the header row contains data '
                             'in all columns (fields) if you selected to use '
                             'the first row as the header.')
        if header_row:
            # Remove all spaces in the columns
            renamed = {c: re.sub(r'\s+', '', c) for c in data.columns}
            data = data.rename(columns=renamed)

            # Validate the names that are actually used from here on (the
            # whitespace-free ones), not the raw header cells.
            for field in renamed.values():
                if field == '' or field.lower().startswith('xml') \
                        or not field[0].isalpha():
                    raise ValueError('Cannot process the uploaded file. '
                                     'Please make sure each column (field) '
                                     'in the header row starts with a letter '
                                     'and doesn\'t start with "xml".')

        facets_fields = define_facets(data)
        facets_fields_str = ','.join(facets_fields)
        all_fields = data.columns.values.tolist()
        all_fields_str = ','.join(all_fields)

        result = {
            'database_uploaded': True,
            'all_fields': all_fields,
            'full_text': all_fields,
            'show': facets_fields,
            'facets': facets_fields,
            'filter': facets_fields,
        }

        # Save the processed file
        data.to_csv(
            app.config['OUTPUT_PATH'],
            sep='\t',
            escapechar='\\',  # test this
            index=False
        )

        # Don't run this code with TestingConfig
        if not app.config['TESTING']:  # pragma: no cover
            opts = '--within-field-separator=\\; ' + \
                   '--full-text=%s ' % all_fields_str + \
                   '--show=%s ' % facets_fields_str + \
                   '--filter=%s ' % facets_fields_str + \
                   '--facets=%s' % facets_fields_str

            # Field names come from the uploaded file, so the command is
            # passed as an argument list without a shell: quotes, ``$`` or
            # backticks in a header can no longer break out of the command.
            # NOTE(review): make may still expand OPTIONS inside its
            # recipes; confirm against the Makefile whether field names
            # need a stricter whitelist.
            command = ['make', 'OPTIONS=%s' % opts,
                       'pclean-all', 'process_input']

            # Process the input
            _, err = subprocess.Popen(command,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE).communicate()

            cmd_error = err.decode('utf-8', errors='replace')
            if '[process_input] Error' in cmd_error:
                app.logger.debug('[Process input]:\n%s', cmd_error)
                # De-duplicate the messages, keeping first-seen order.
                errors = dict.fromkeys(
                    err_line for err_line in cmd_error.split('\n')
                    if err_line != ''
                    and not err_line.startswith(('make', 'sort'))
                )
                error = '<br/>'.join(errors)

    except Exception as e:
        # An exception with an empty message must still count as a failure.
        error = str(e) or type(e).__name__
        app.logger.exception(e)

    return jsonify(success=not error, error=error, data=result)


@bp.route('/save_uploaded_dataset/', methods=['POST'])
def save_uploaded_dataset():
    """
    POST /save_uploaded_dataset/
        Save the uploaded dataset's settings and start the CompleteSearch
        server

    :param data: a dictionary with all dataset's settings, which have been
        generated by the ``upload_file`` function. It is read from the raw
        request body as UTF-8 encoded JSON.

    :returns: dictionary with the ``success`` property and an ``error`` message

    :rtype: JSON response
    """
    settings = app.settings.to_dict()
    error = ''

    try:
        if not request.data:
            raise ValueError('Data is missing.')
        params = json.loads(str(request.data, 'utf-8'))
        if not isinstance(params, dict):
            # ``dict.update`` would fail obscurely (or accept a list of
            # pairs) for anything that is not a JSON object.
            raise ValueError('Data must be a JSON object.')

        # Save the settings
        # NOTE(review): this relies on ``to_dict()`` returning the mapping
        # that ``save()`` persists; if it returns a copy the update is
        # lost. Confirm against the settings class.
        settings.update(params)
        app.settings.save()

        # Start the server. The command is constant, so no shell is needed;
        # the return code of ``make`` is not inspected.
        subprocess.Popen(['make', 'start']).communicate()

    except Exception as e:
        # An exception with an empty message must still count as a failure.
        error = str(e) or type(e).__name__
        app.logger.exception(e)

    return jsonify(success=not error, error=error)


def allowed_file(filename):
    """
    Check if the uploading file's type is allowed.

    The extension is the text after the last dot, compared in lower case
    against ``app.config['ALLOWED_EXTENSIONS']``.

    :param filename: filename; ``None`` or an empty string is rejected

    :returns: result of the check

    :rtype: bool
    """
    # An upload without a file name would otherwise raise a TypeError on
    # the ``in`` test below.
    if not filename or '.' not in filename:
        return False

    extension = filename.rsplit('.', 1)[1].lower()
    return extension in app.config['ALLOWED_EXTENSIONS']


def define_facets(data, max_facets=5):
    """
    Define facets by their occurrence in the dataset.

    A column qualifies when at least one of its non-NaN values is repeated,
    i.e. it has fewer distinct values than non-NaN rows. Qualifying columns
    are ordered by their number of distinct values (fewest first, ties keep
    the column order of ``data``) and the first ``max_facets`` are returned.

    :param data: ``DataFrame`` with the uploaded dataset

    :param max_facets: maximum number of facets to return (``None`` for no
        limit)

    :returns: field names which will be used as facets

    :rtype: list
    """
    non_nan_rows = data.count()  # number of non-NaN rows in each column

    # Define good facets (columns which have more than one occurrence).
    # ``nunique`` ignores NaN like ``value_counts`` does, and is evaluated
    # only once per column.
    candidates = []
    for column in data:
        distinct = data[column].nunique()
        if distinct < non_nan_rows[column]:
            candidates.append((distinct, column))

    # Sort on the count only: list.sort is stable, so ties keep column order.
    candidates.sort(key=lambda item: item[0])

    return [column for _, column in candidates[:max_facets]]


# def process_csv(csv_file, delimiter):
#     """ Check the uploaded file (skip bad rows) and define facets. """
#     data = pd.read_csv(
#         csv_file,
#         delimiter=delimiter,
#         encoding='utf-8',
#         engine='c',
#         comment='#',
#         error_bad_lines=False,
#         dtype=object,
#     )

#     # Number of non-NaN rows in each column
#     non_nan_rows = data.count()

#     # Define good facets (columns which have more than one occurrence)
#     facets = [
#         {
#             'name': column,
#             'count': data[column].value_counts().size
#         }
#         for column in data
#         if data[column].value_counts().size < non_nan_rows[column]
#     ]
#     facets = [x['name'] for x in sorted(facets, key=lambda x: x['count'])[:5]]

#     return data, facets