Merge pull request #3970 from kobotoolbox/3967-paired-data-slow-perfo…

…rmance Generate the external XML data of connected parent projects faster
kobotoolbox · Aug 16, 2022 · a86a6d9 · a86a6d9
2 parents a9556ec + 221566f
commit a86a6d9
Show file tree

Hide file tree

Showing 7 changed files with 75 additions and 12 deletions.
diff --git a/kobo/apps/reports/constants.py b/kobo/apps/reports/constants.py
@@ -9,4 +9,5 @@
 DEFAULT_REPORTS_KEY = 'default'
 
 FUZZY_VERSION_ID_KEY = '_version_'
-INFERRED_VERSION_ID_KEY = '__inferred_version__'
+FUZZY_VERSION_PATTERN = r'^__?version__?(\d{3})?$'
+INFERRED_VERSION_ID_KEY = '__inferred_version__'
diff --git a/kpi/models/paired_data.py b/kpi/models/paired_data.py
@@ -66,6 +66,7 @@ def __init__(
             self.paired_data_uid = paired_data_uid
 
         self._asset_file = None
+        self._source_asset = None
 
     def __str__(self):
         return f'<PairedData {self.paired_data_uid} ({self.filename})>'
@@ -156,7 +157,11 @@ def get_download_url(self, request):
                        args=(self.asset.uid, self.paired_data_uid, 'xml'),
                        request=request)
 
-    def get_source(self) -> Union['Asset', None]:
+    def get_source(self, force: bool = False) -> Union['Asset', None]:
+        # if `self._source_asset` has been already set once, use the cache
+        # object instead of fetching it from DB again.
+        if not force and self._source_asset:
+            return self._source_asset
 
         # Avoid circular import
         Asset = self.asset.__class__  # noqa
@@ -183,7 +188,9 @@ def get_source(self) -> Union['Asset', None]:
         ):
             return None
 
-        return source_asset
+        self._source_asset = source_asset
+
+        return self._source_asset
 
     @property
     def md5_hash(self):

diff --git a/kpi/serializers/v2/asset.py b/kpi/serializers/v2/asset.py
@@ -1,5 +1,6 @@
 # coding: utf-8
 import json
+import re
 
 from django.conf import settings
 from django.utils.translation import gettext as t
@@ -8,6 +9,7 @@
 from rest_framework.reverse import reverse
 from rest_framework.utils.serializer_helpers import ReturnList
 
+from kobo.apps.reports.constants import FUZZY_VERSION_PATTERN
 from kobo.apps.reports.report_data import build_formpack
 from kpi.constants import (
     ASSET_STATUS_DISCOVERABLE,
@@ -498,18 +500,32 @@ def validate_data_sharing(self, data_sharing: dict) -> dict:
             else:
                 asset = self.instance
                 fields = data_sharing['fields']
-                form_pack, _unused = build_formpack(asset, submission_stream=[])
+                # We used to get all fields for every version for valid fields,
+                # but the UI shows the latest version only, so only its fields
+                # can be picked up. It is easier then to compare valid fields with
+                # user's choice.
+                form_pack, _unused = build_formpack(
+                    asset, submission_stream=[], use_all_form_versions=False
+                )
+                # We do not want to include the version field.
+                # See `_infer_version_id()` in `kobo.apps.reports.report_data.build_formpack`
+                # for field name alternatives.
                 valid_fields = [
                     f.path for f in form_pack.get_fields_for_versions(
                         form_pack.versions.keys()
-                    )
+                    ) if not re.match(FUZZY_VERSION_PATTERN, f.path)
                 ]
                 unknown_fields = set(fields) - set(valid_fields)
                 if unknown_fields and valid_fields:
                     errors['fields'] = t(
                         'Some fields are invalid, '
                         'choices are: `{valid_fields}`'
                     ).format(valid_fields='`,`'.join(valid_fields))
+
+                # Force `fields` to be an empty list to avoid useless parsing when
+                # fetching external xml endpoint (i.e.: /api/v2/assets/<asset_uid>/paired-data/<paired_data_uid>/external.xml)
+                if sorted(valid_fields) == sorted(fields):
+                    data_sharing['fields'] = []
         else:
             data_sharing['fields'] = []
 

diff --git a/kpi/serializers/v2/paired_data.py b/kpi/serializers/v2/paired_data.py
@@ -6,6 +6,7 @@
 from rest_framework import serializers
 from rest_framework.reverse import reverse
 
+from kobo.apps.reports.constants import FUZZY_VERSION_PATTERN
 from kobo.apps.reports.report_data import build_formpack
 from kpi.constants import (
     ASSET_TYPE_SURVEY,
@@ -124,11 +125,20 @@ def _validate_fields(self, attrs: dict):
             return
 
         source = attrs['source']
-        form_pack, _unused = build_formpack(source, submission_stream=[])
+        # We used to get all fields for every version for valid fields,
+        # but the UI shows the latest version only, so only its fields
+        # can be picked up. It is easier then to compare valid fields with
+        # user's choice.
+        form_pack, _unused = build_formpack(
+            source, submission_stream=[], use_all_form_versions=False
+        )
+        # We do not want to include the version field.
+        # See `_infer_version_id()` in `kobo.apps.reports.report_data.build_formpack`
+        # for field name alternatives.
         valid_fields = [
             f.path for f in form_pack.get_fields_for_versions(
                 form_pack.versions.keys()
-            )
+            ) if not re.match(FUZZY_VERSION_PATTERN, f.path)
         ]
 
         source_fields = source.data_sharing.get('fields') or valid_fields
@@ -145,6 +155,11 @@ def _validate_fields(self, attrs: dict):
                 }
             )
 
+        # Force `posted_fields` to be an empty list to avoid useless parsing when
+        # fetching external xml endpoint (i.e.: /api/v2/assets/<asset_uid>/paired-data/<paired_data_uid>/external.xml)
+        if sorted(valid_fields) == sorted(posted_fields):
+            posted_fields = []
+
         attrs['fields'] = posted_fields
 
     def _validate_filename(self, attrs: dict):

diff --git a/kpi/tests/test_paired_data.py b/kpi/tests/test_paired_data.py
@@ -125,7 +125,7 @@ def test_cannot_retrieve_source_if_source_stop_data_sharing(self):
         self.assertEqual(source, self.source_asset)
         self.source_asset.data_sharing['enabled'] = False
         self.source_asset.save()
-        source = self.paired_data.get_source()
+        source = self.paired_data.get_source(force=True)
         self.assertEqual(source, None)
 
     def test_cannot_retrieve_source_if_source_remove_perm(self):
@@ -134,5 +134,5 @@ def test_cannot_retrieve_source_if_source_remove_perm(self):
         self.source_asset.remove_perm(
             self.paired_data.asset.owner, PERM_VIEW_SUBMISSIONS
         )
-        source = self.paired_data.get_source()
+        source = self.paired_data.get_source(force=True)
         self.assertEqual(source, None)
diff --git a/kpi/utils/xml.py b/kpi/utils/xml.py
@@ -3,19 +3,26 @@
 from typing import Optional, Union, List
 from lxml import etree
 
+from django_request_cache import cache_for_request
+from shortuuid import ShortUUID
+
 
 def strip_nodes(
     source: Union[str, bytes],
     nodes_to_keep: list,
     use_xpath: bool = False,
     xml_declaration: bool = False,
     rename_root_node_to: Optional[str] = None,
+    bulk_action_cache_key: str = None,
 ) -> str:
     """
     Returns a stripped version of `source`. It keeps only nodes provided in
     `nodes_to_keep`.
     If `rename_root_node_to` is provided, the root node will be renamed to the
     value of that parameter in the returned XML string.
+
+    A random string can be passed to `bulk_action_cache_key` to get the
+    XPaths only once if calling `strip_nodes()` several times in a loop.
     """
     # Force `source` to be bytes in case it contains an XML declaration
     # `etree` does not support strings with xml declarations.
@@ -28,7 +35,13 @@ def strip_nodes(
     root_element = tree.getroot()
     root_path = tree.getpath(root_element)
 
-    def get_xpath_matches():
+    # `@cache_for_request` uses the parameters of the function it decorates
+    # to generate the key under which the returned value of the function is
+    # stored for cache purpose.
+    # `cache_key` is only there to serve that purpose and ensure
+    # `@cache_for_request` uniqueness.
+    @cache_for_request
+    def get_xpath_matches(cache_key: str):
         if use_xpath:
             xpaths_ = []
             for xpath_ in nodes_to_keep:
@@ -118,7 +131,14 @@ def remove_root_path(path_: str) -> str:
         return path_.replace(root_path, '')
 
     if len(nodes_to_keep):
-        xpath_matches = get_xpath_matches()
+        # Always sends an unique string to `get_xpath_matches()`
+        # See comments above the function
+        if bulk_action_cache_key is None:
+            cache_key = ShortUUID().random(24)
+        else:
+            cache_key = bulk_action_cache_key
+
+        xpath_matches = get_xpath_matches(cache_key=cache_key)
         process_node(root_element, xpath_matches)
 
     if rename_root_node_to:

diff --git a/kpi/views/v2/paired_data.py b/kpi/views/v2/paired_data.py
@@ -7,6 +7,7 @@
 from rest_framework.decorators import action
 from rest_framework.response import Response
 from rest_framework_extensions.mixins import NestedViewSetMixin
+from shortuuid import ShortUUID
 
 from kpi.constants import SUBMISSION_FORMAT_TYPE_XML
 from kpi.models import Asset, AssetFile, PairedData
@@ -252,6 +253,8 @@ def external(self, request, paired_data_uid, **kwargs):
             format_type=SUBMISSION_FORMAT_TYPE_XML
         )
         parsed_submissions = []
+        allowed_fields = paired_data.allowed_fields
+        random_uuid = ShortUUID().random(24)
 
         for submission in submissions:
             # Use `rename_root_node_to='data'` to rename the root node of each
@@ -263,9 +266,10 @@ def external(self, request, paired_data_uid, **kwargs):
             parsed_submissions.append(
                 strip_nodes(
                     submission,
-                    paired_data.allowed_fields,
+                    allowed_fields,
                     use_xpath=True,
                     rename_root_node_to='data',
+                    bulk_action_cache_key=random_uuid,
                 )
             )