Skip to content

Commit

Permalink
Merge pull request #3970 from kobotoolbox/3967-paired-data-slow-perfo…
Browse files Browse the repository at this point in the history
…rmance

Generate the external XML data of connected parent projects faster
  • Loading branch information
jnm authored Aug 16, 2022
2 parents a9556ec + 221566f commit a86a6d9
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 12 deletions.
3 changes: 2 additions & 1 deletion kobo/apps/reports/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@
DEFAULT_REPORTS_KEY = 'default'

FUZZY_VERSION_ID_KEY = '_version_'
INFERRED_VERSION_ID_KEY = '__inferred_version__'
FUZZY_VERSION_PATTERN = r'^__?version__?(\d{3})?$'
INFERRED_VERSION_ID_KEY = '__inferred_version__'
11 changes: 9 additions & 2 deletions kpi/models/paired_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def __init__(
self.paired_data_uid = paired_data_uid

self._asset_file = None
self._source_asset = None

def __str__(self):
return f'<PairedData {self.paired_data_uid} ({self.filename})>'
Expand Down Expand Up @@ -156,7 +157,11 @@ def get_download_url(self, request):
args=(self.asset.uid, self.paired_data_uid, 'xml'),
request=request)

def get_source(self) -> Union['Asset', None]:
def get_source(self, force: bool = False) -> Union['Asset', None]:
# if `self._source_asset` has been already set once, use the cache
# object instead of fetching it from DB again.
if not force and self._source_asset:
return self._source_asset

# Avoid circular import
Asset = self.asset.__class__ # noqa
Expand All @@ -183,7 +188,9 @@ def get_source(self) -> Union['Asset', None]:
):
return None

return source_asset
self._source_asset = source_asset

return self._source_asset

@property
def md5_hash(self):
Expand Down
20 changes: 18 additions & 2 deletions kpi/serializers/v2/asset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# coding: utf-8
import json
import re

from django.conf import settings
from django.utils.translation import gettext as t
Expand All @@ -8,6 +9,7 @@
from rest_framework.reverse import reverse
from rest_framework.utils.serializer_helpers import ReturnList

from kobo.apps.reports.constants import FUZZY_VERSION_PATTERN
from kobo.apps.reports.report_data import build_formpack
from kpi.constants import (
ASSET_STATUS_DISCOVERABLE,
Expand Down Expand Up @@ -498,18 +500,32 @@ def validate_data_sharing(self, data_sharing: dict) -> dict:
else:
asset = self.instance
fields = data_sharing['fields']
form_pack, _unused = build_formpack(asset, submission_stream=[])
# We used to get all fields for every version for valid fields,
# but the UI shows the latest version only, so only its fields
# can be picked up. It is easier then to compare valid fields with
# user's choice.
form_pack, _unused = build_formpack(
asset, submission_stream=[], use_all_form_versions=False
)
# We do not want to include the version field.
# See `_infer_version_id()` in `kobo.apps.reports.report_data.build_formpack`
# for field name alternatives.
valid_fields = [
f.path for f in form_pack.get_fields_for_versions(
form_pack.versions.keys()
)
) if not re.match(FUZZY_VERSION_PATTERN, f.path)
]
unknown_fields = set(fields) - set(valid_fields)
if unknown_fields and valid_fields:
errors['fields'] = t(
'Some fields are invalid, '
'choices are: `{valid_fields}`'
).format(valid_fields='`,`'.join(valid_fields))

# Force `fields` to be an empty list to avoid useless parsing when
# fetching external xml endpoint (i.e.: /api/v2/assets/<asset_uid>/paired-data/<paired_data_uid>/external.xml)
if sorted(valid_fields) == sorted(fields):
data_sharing['fields'] = []
else:
data_sharing['fields'] = []

Expand Down
19 changes: 17 additions & 2 deletions kpi/serializers/v2/paired_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from rest_framework import serializers
from rest_framework.reverse import reverse

from kobo.apps.reports.constants import FUZZY_VERSION_PATTERN
from kobo.apps.reports.report_data import build_formpack
from kpi.constants import (
ASSET_TYPE_SURVEY,
Expand Down Expand Up @@ -124,11 +125,20 @@ def _validate_fields(self, attrs: dict):
return

source = attrs['source']
form_pack, _unused = build_formpack(source, submission_stream=[])
# We used to get all fields for every version for valid fields,
# but the UI shows the latest version only, so only its fields
# can be picked up. It is easier then to compare valid fields with
# user's choice.
form_pack, _unused = build_formpack(
source, submission_stream=[], use_all_form_versions=False
)
# We do not want to include the version field.
# See `_infer_version_id()` in `kobo.apps.reports.report_data.build_formpack`
# for field name alternatives.
valid_fields = [
f.path for f in form_pack.get_fields_for_versions(
form_pack.versions.keys()
)
) if not re.match(FUZZY_VERSION_PATTERN, f.path)
]

source_fields = source.data_sharing.get('fields') or valid_fields
Expand All @@ -145,6 +155,11 @@ def _validate_fields(self, attrs: dict):
}
)

# Force `posted_fields` to be an empty list to avoid useless parsing when
# fetching external xml endpoint (i.e.: /api/v2/assets/<asset_uid>/paired-data/<paired_data_uid>/external.xml)
if sorted(valid_fields) == sorted(posted_fields):
posted_fields = []

attrs['fields'] = posted_fields

def _validate_filename(self, attrs: dict):
Expand Down
4 changes: 2 additions & 2 deletions kpi/tests/test_paired_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def test_cannot_retrieve_source_if_source_stop_data_sharing(self):
self.assertEqual(source, self.source_asset)
self.source_asset.data_sharing['enabled'] = False
self.source_asset.save()
source = self.paired_data.get_source()
source = self.paired_data.get_source(force=True)
self.assertEqual(source, None)

def test_cannot_retrieve_source_if_source_remove_perm(self):
Expand All @@ -134,5 +134,5 @@ def test_cannot_retrieve_source_if_source_remove_perm(self):
self.source_asset.remove_perm(
self.paired_data.asset.owner, PERM_VIEW_SUBMISSIONS
)
source = self.paired_data.get_source()
source = self.paired_data.get_source(force=True)
self.assertEqual(source, None)
24 changes: 22 additions & 2 deletions kpi/utils/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,26 @@
from typing import Optional, Union, List
from lxml import etree

from django_request_cache import cache_for_request
from shortuuid import ShortUUID


def strip_nodes(
source: Union[str, bytes],
nodes_to_keep: list,
use_xpath: bool = False,
xml_declaration: bool = False,
rename_root_node_to: Optional[str] = None,
bulk_action_cache_key: str = None,
) -> str:
"""
Returns a stripped version of `source`. It keeps only nodes provided in
`nodes_to_keep`.
If `rename_root_node_to` is provided, the root node will be renamed to the
value of that parameter in the returned XML string.
A random string can be passed to `bulk_action_cache_key` to get the
XPaths only once if calling `strip_nodes()` several times in a loop.
"""
# Force `source` to be bytes in case it contains an XML declaration
# `etree` does not support strings with xml declarations.
Expand All @@ -28,7 +35,13 @@ def strip_nodes(
root_element = tree.getroot()
root_path = tree.getpath(root_element)

def get_xpath_matches():
# `@cache_for_request` uses the parameters of the function it decorates
# to generate the key under which the returned value of the function is
# stored for cache purpose.
# `cache_key` is only there to serve that purpose and ensure
# `@cache_for_request` uniqueness.
@cache_for_request
def get_xpath_matches(cache_key: str):
if use_xpath:
xpaths_ = []
for xpath_ in nodes_to_keep:
Expand Down Expand Up @@ -118,7 +131,14 @@ def remove_root_path(path_: str) -> str:
return path_.replace(root_path, '')

if len(nodes_to_keep):
xpath_matches = get_xpath_matches()
# Always sends an unique string to `get_xpath_matches()`
# See comments above the function
if bulk_action_cache_key is None:
cache_key = ShortUUID().random(24)
else:
cache_key = bulk_action_cache_key

xpath_matches = get_xpath_matches(cache_key=cache_key)
process_node(root_element, xpath_matches)

if rename_root_node_to:
Expand Down
6 changes: 5 additions & 1 deletion kpi/views/v2/paired_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from rest_framework.decorators import action
from rest_framework.response import Response
from rest_framework_extensions.mixins import NestedViewSetMixin
from shortuuid import ShortUUID

from kpi.constants import SUBMISSION_FORMAT_TYPE_XML
from kpi.models import Asset, AssetFile, PairedData
Expand Down Expand Up @@ -252,6 +253,8 @@ def external(self, request, paired_data_uid, **kwargs):
format_type=SUBMISSION_FORMAT_TYPE_XML
)
parsed_submissions = []
allowed_fields = paired_data.allowed_fields
random_uuid = ShortUUID().random(24)

for submission in submissions:
# Use `rename_root_node_to='data'` to rename the root node of each
Expand All @@ -263,9 +266,10 @@ def external(self, request, paired_data_uid, **kwargs):
parsed_submissions.append(
strip_nodes(
submission,
paired_data.allowed_fields,
allowed_fields,
use_xpath=True,
rename_root_node_to='data',
bulk_action_cache_key=random_uuid,
)
)

Expand Down

0 comments on commit a86a6d9

Please sign in to comment.