
Commit f39d6f2

Merge branch 'main' into issue-578-custom-storage (1.14.4 release)
2 parents: 9c3b053 + b2c5b9b

86 files changed: +2648, -1335 lines


.github/workflows/k3d-nightly-ci.yaml (+17, -1)

@@ -8,8 +8,24 @@ on:
   workflow_dispatch:

 jobs:
+  collect-test-modules:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v3
+      - id: set-matrix
+        run: |
+          echo matrix="$(ls ./backend/test_nightly/ | grep -o "^test_.*" | jq -R -s -c 'split("\n")[:-1]')" >> $GITHUB_OUTPUT
+
   btrix-k3d-nightly-test:
+    name: ${{ matrix.module }}
+    needs: collect-test-modules
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        module: ${{fromJSON(needs.collect-test-modules.outputs.matrix)}}
+      fail-fast: false
     steps:
       - name: Create k3d Cluster
         uses: AbsaOSS/k3d-action@v2
@@ -82,7 +98,7 @@ jobs:
         run: kubectl exec -i deployment/local-minio -c minio -- mkdir /data/replica-0

       - name: Run Tests
-        run: pytest -vv ./backend/test_nightly/test_*.py
+        run: pytest -vv ./backend/test_nightly/${{ matrix.module }}

       - name: Print Backend Logs (API)
         if: ${{ failure() }}
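
The change above fans the nightly suite out into one job per test module: collect-test-modules lists ./backend/test_nightly/, keeps the test_* files, and publishes them as a JSON array through $GITHUB_OUTPUT, and btrix-k3d-nightly-test reads that array back with fromJSON into a matrix, names each job after its module, and runs pytest against only that file with fail-fast disabled, so one failing module no longer cancels the rest. A minimal Python sketch of the collection step, assuming it is run from the repository root; the function name is illustrative and the workflow itself uses the shell pipeline shown above:

import json
from pathlib import Path


def collect_test_modules(test_dir: str = "./backend/test_nightly") -> str:
    """Build the JSON array of nightly test modules, one matrix entry per file."""
    modules = sorted(
        p.name for p in Path(test_dir).iterdir() if p.name.startswith("test_")
    )
    return json.dumps(modules)


if __name__ == "__main__":
    # Roughly: ls ./backend/test_nightly/ | grep -o "^test_.*" | jq -R -s -c 'split("\n")[:-1]'
    print(f"matrix={collect_test_modules()}")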

backend/btrixcloud/basecrawls.py (+13, -49)

@@ -1,6 +1,6 @@
 """base crawl type"""

-from datetime import datetime, timedelta
+from datetime import datetime
 from typing import Optional, List, Union, Dict, Any, Type, TYPE_CHECKING, cast, Tuple
 from uuid import UUID
 import os
@@ -29,10 +29,9 @@
     UpdatedResponse,
     DeletedResponseQuota,
     CrawlSearchValuesResponse,
-    PRESIGN_DURATION_SECONDS,
 )
 from .pagination import paginated_format, DEFAULT_PAGE_SIZE
-from .utils import dt_now, date_to_str, get_origin
+from .utils import dt_now, get_origin, date_to_str

 if TYPE_CHECKING:
     from .crawlconfigs import CrawlConfigOps
@@ -65,9 +64,6 @@ class BaseCrawlOps:
     background_job_ops: BackgroundJobOps
     page_ops: PageOps

-    presign_duration_seconds: int
-    expire_at_duration_seconds: int
-
     def __init__(
         self,
         mdb,
@@ -89,9 +85,6 @@ def __init__(
         self.background_job_ops = background_job_ops
         self.page_ops = cast(PageOps, None)

-        # renew when <25% of time remaining
-        self.expire_at_duration_seconds = int(PRESIGN_DURATION_SECONDS * 0.75)
-
     def set_page_ops(self, page_ops):
         """set page ops reference"""
         self.page_ops = page_ops
@@ -124,13 +117,12 @@ async def _files_to_resources(
         files: List[Dict],
         org: Organization,
         crawlid: str,
-        qa_run_id: Optional[str] = None,
     ) -> List[CrawlFileOut]:
         if not files:
             return []

         crawl_files = [CrawlFile(**data) for data in files]
-        return await self.resolve_signed_urls(crawl_files, org, crawlid, qa_run_id)
+        return await self.resolve_signed_urls(crawl_files, org, crawlid)

     async def get_wacz_files(self, crawl_id: str, org: Organization):
         """Return list of WACZ files associated with crawl."""
@@ -177,11 +169,14 @@ async def get_crawl_out(

         oid = res.get("oid")
         if oid:
+            origin = get_origin(headers)
             res["pagesQueryUrl"] = (
-                get_origin(headers)
-                + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
+                origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
             )

+        # this will now disable the downloadUrl in RWP
+        res["downloadUrl"] = None
+
         crawl = CrawlOutWithResources.from_dict(res)

         if not skip_resources:
@@ -464,50 +459,19 @@ async def resolve_signed_urls(
         files: List[CrawlFile],
         org: Organization,
         crawl_id: Optional[str] = None,
-        qa_run_id: Optional[str] = None,
-        update_presigned_url: bool = False,
+        force_update=False,
     ) -> List[CrawlFileOut]:
         """Regenerate presigned URLs for files as necessary"""
         if not files:
             print("no files")
             return []

-        delta = timedelta(seconds=self.expire_at_duration_seconds)
-
         out_files = []

         for file_ in files:
-            presigned_url = file_.presignedUrl
-            now = dt_now()
-
-            if (
-                update_presigned_url
-                or not presigned_url
-                or (file_.expireAt and now >= file_.expireAt)
-            ):
-                exp = now + delta
-                presigned_url = await self.storage_ops.get_presigned_url(
-                    org, file_, PRESIGN_DURATION_SECONDS
-                )
-
-                prefix = "files"
-                if qa_run_id:
-                    prefix = f"qaFinished.{qa_run_id}.{prefix}"
-
-                await self.crawls.find_one_and_update(
-                    {f"{prefix}.filename": file_.filename},
-                    {
-                        "$set": {
-                            f"{prefix}.$.presignedUrl": presigned_url,
-                            f"{prefix}.$.expireAt": exp,
-                        }
-                    },
-                )
-                file_.expireAt = exp
-
-            expire_at_str = ""
-            if file_.expireAt:
-                expire_at_str = date_to_str(file_.expireAt)
+            presigned_url, expire_at = await self.storage_ops.get_presigned_url(
+                org, file_, force_update=force_update
+            )

             out_files.append(
                 CrawlFileOut(
@@ -517,7 +481,7 @@ async def resolve_signed_urls(
                     size=file_.size,
                     crawlId=crawl_id,
                     numReplicas=len(file_.replicas) if file_.replicas else 0,
-                    expireAt=expire_at_str,
+                    expireAt=date_to_str(expire_at),
                 )
             )
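
With this change resolve_signed_urls stops doing its own expiry bookkeeping: the timedelta math against PRESIGN_DURATION_SECONDS, the qa_run_id-aware find_one_and_update on the crawls collection, and the cached presignedUrl/expireAt fields all move behind storage_ops.get_presigned_url, which now returns the URL together with its expiry and accepts a force_update flag; resolve_signed_urls only formats that expiry with date_to_str for CrawlFileOut. A rough stub of the contract this method now relies on, perhaps useful in tests; the class and attribute names below are hypothetical and not the StorageOps implementation shipped in this commit:

from datetime import datetime, timedelta, timezone


class StubStorageOps:
    """Hypothetical stand-in for the storage layer that resolve_signed_urls delegates to."""

    def __init__(self, duration_seconds: int = 3600):
        self.duration_seconds = duration_seconds
        # cache keyed by filename -> (signed url, expiry); file_ is duck-typed here
        self._cache: dict[str, tuple[str, datetime]] = {}

    async def get_presigned_url(self, org, file_, force_update: bool = False):
        """Return (presigned_url, expire_at); reuse a cached URL unless expired or forced."""
        now = datetime.now(timezone.utc)
        cached = self._cache.get(file_.filename)
        if cached and not force_update and cached[1] > now:
            return cached
        expire_at = now + timedelta(seconds=self.duration_seconds)
        url = f"https://storage.invalid/{file_.filename}?signature=stub"  # placeholder URL
        self._cache[file_.filename] = (url, expire_at)
        return url, expire_at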

backend/btrixcloud/colls.py (+80, -8)

@@ -11,6 +11,7 @@

 import asyncio
 import pymongo
+import aiohttp
 from pymongo.collation import Collation
 from fastapi import Depends, HTTPException, Response
 from fastapi.responses import StreamingResponse
@@ -342,9 +343,11 @@ async def get_collection_out(
         result = await self.get_collection_raw(coll_id, public_or_unlisted_only)

         if resources:
-            result["resources"], crawl_ids, pages_optimized = (
-                await self.get_collection_crawl_resources(coll_id)
-            )
+            (
+                result["resources"],
+                crawl_ids,
+                pages_optimized,
+            ) = await self.get_collection_crawl_resources(coll_id)

             initial_pages, _ = await self.page_ops.list_pages(
                 crawl_ids=crawl_ids,
@@ -353,11 +356,21 @@ async def get_collection_out(

         public = "public/" if public_or_unlisted_only else ""

+        origin = get_origin(headers)
+
+        if public_or_unlisted_only:
+            slug = result.get("slug")
+            result["downloadUrl"] = (
+                origin + f"/api/{public}orgs/{org.slug}/collections/{slug}/download"
+            )
+        else:
+            # disable download link, as not public without auth
+            result["downloadUrl"] = None
+
         if pages_optimized:
             result["initialPages"] = initial_pages
             result["pagesQueryUrl"] = (
-                get_origin(headers)
-                + f"/api/orgs/{org.id}/collections/{coll_id}/{public}pages"
+                origin + f"/api/orgs/{org.id}/collections/{coll_id}/{public}pages"
             )

         thumbnail = result.get("thumbnail")
@@ -378,6 +391,9 @@ async def get_public_collection_out(
         """Get PublicCollOut by id"""
         result = await self.get_collection_raw(coll_id)

+        result["orgName"] = org.name
+        result["orgPublicProfile"] = org.enablePublicProfile
+
         allowed_access = [CollAccessType.PUBLIC]
         if allow_unlisted:
             allowed_access.append(CollAccessType.UNLISTED)
@@ -396,6 +412,38 @@

         return PublicCollOut.from_dict(result)

+    async def get_public_thumbnail(
+        self, slug: str, org: Organization
+    ) -> StreamingResponse:
+        """return thumbnail of public collection, if any"""
+        result = await self.get_collection_raw_by_slug(
+            slug, public_or_unlisted_only=True
+        )
+
+        thumbnail = result.get("thumbnail")
+        if not thumbnail:
+            raise HTTPException(status_code=404, detail="thumbnail_not_found")
+
+        image_file = ImageFile(**thumbnail)
+        image_file_out = await image_file.get_public_image_file_out(
+            org, self.storage_ops
+        )
+
+        path = self.storage_ops.resolve_internal_access_path(image_file_out.path)
+
+        async def reader():
+            async with aiohttp.ClientSession() as session:
+                async with session.get(path) as resp:
+                    async for chunk in resp.content.iter_chunked(4096):
+                        yield chunk
+
+        headers = {
+            "Cache-Control": "max-age=3600, stale-while-revalidate=86400",
+            "Content-Length": f"{image_file.size}",
+            "Etag": f'"{image_file.hash}"',
+        }
+        return StreamingResponse(reader(), media_type=image_file.mime, headers=headers)
+
     async def list_collections(
         self,
         org: Organization,
@@ -497,6 +545,9 @@ async def list_collections(
                 org, self.storage_ops
             )

+            res["orgName"] = org.name
+            res["orgPublicProfile"] = org.enablePublicProfile
+
             if public_colls_out:
                 collections.append(PublicCollOut.from_dict(res))
             else:
@@ -839,6 +890,7 @@ async def stream_iter():
            file_prep.upload_name,
            stream_iter(),
            MIN_UPLOAD_PART_SIZE,
+           mime=file_prep.mime,
        ):
            print("Collection thumbnail stream upload failed", flush=True)
            raise HTTPException(status_code=400, detail="upload_failed")
@@ -962,9 +1014,11 @@ async def get_collection_all(org: Organization = Depends(org_viewer_dep)):
        try:
            all_collections, _ = await colls.list_collections(org, page_size=10_000)
            for collection in all_collections:
-                results[collection.name], _, _ = (
-                    await colls.get_collection_crawl_resources(collection.id)
-                )
+                (
+                    results[collection.name],
+                    _,
+                    _,
+                ) = await colls.get_collection_crawl_resources(collection.id)
        except Exception as exc:
            # pylint: disable=raise-missing-from
            raise HTTPException(
@@ -1162,6 +1216,24 @@ async def download_public_collection(

        return await colls.download_collection(coll.id, org)

+    @app.get(
+        "/public/orgs/{org_slug}/collections/{coll_slug}/thumbnail",
+        tags=["collections", "public"],
+        response_class=StreamingResponse,
+    )
+    async def get_public_thumbnail(
+        org_slug: str,
+        coll_slug: str,
+    ):
+        try:
+            org = await colls.orgs.get_org_by_slug(org_slug)
+        # pylint: disable=broad-exception-caught
+        except Exception:
+            # pylint: disable=raise-missing-from
+            raise HTTPException(status_code=404, detail="collection_not_found")
+
+        return await colls.get_public_thumbnail(coll_slug, org)
+
     @app.post(
         "/orgs/{oid}/collections/{coll_id}/home-url",
         tags=["collections"],

backend/btrixcloud/crawlconfigs.py (+26)

@@ -36,6 +36,7 @@
     CrawlerChannel,
     CrawlerChannels,
     StartedResponse,
+    SuccessResponse,
     CrawlConfigAddedResponse,
     CrawlConfigSearchValues,
     CrawlConfigUpdateResponse,
@@ -1036,6 +1037,21 @@ def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
         prefix = org.slug + "-" + name
         return prefix[:80]

+    async def re_add_all_scheduled_cron_jobs(self):
+        """Re-add all scheduled workflow cronjobs"""
+        match_query = {"schedule": {"$nin": ["", None]}, "inactive": {"$ne": True}}
+        async for config_dict in self.crawl_configs.find(match_query):
+            config = CrawlConfig.from_dict(config_dict)
+            try:
+                await self.crawl_manager.update_scheduled_job(config)
+                print(f"Updated cronjob for scheduled workflow {config.id}", flush=True)
+            # pylint: disable=broad-except
+            except Exception as err:
+                print(
+                    f"Error updating cronjob for scheduled workflow {config.id}: {err}",
+                    flush=True,
+                )
+

 # ============================================================================
 # pylint: disable=too-many-locals
@@ -1272,6 +1288,16 @@ async def make_inactive(cid: UUID, org: Organization = Depends(org_crawl_dep)):

         return await ops.do_make_inactive(crawlconfig)

+    @app.post("/orgs/all/crawlconfigs/reAddCronjobs", response_model=SuccessResponse)
+    async def re_add_all_scheduled_cron_jobs(
+        user: User = Depends(user_dep),
+    ):
+        if not user.is_superuser:
+            raise HTTPException(status_code=403, detail="Not Allowed")
+
+        asyncio.create_task(ops.re_add_all_scheduled_cron_jobs())
+        return {"success": True}
+
     org_ops.router.include_router(router)

     return ops
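
The new reAddCronjobs endpoint is superuser-only and returns {"success": true} immediately while re_add_all_scheduled_cron_jobs runs as a background task, re-applying the cronjob for every active workflow with a non-empty schedule. A hedged example of invoking it, assuming the /api prefix used elsewhere in this commit and a superuser bearer token, both placeholders:

import requests


def re_add_cronjobs(api_base: str, token: str) -> dict:
    """Trigger the background cronjob refresh and return the immediate API response."""
    resp = requests.post(
        f"{api_base}/api/orgs/all/crawlconfigs/reAddCronjobs",
        headers={"Authorization": f"Bearer {token}"},
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()  # expected: {"success": True}


if __name__ == "__main__":
    print(re_add_cronjobs("https://btrix.example.org", "<superuser-token>"))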

backend/btrixcloud/crawls.py (+1, -3)

@@ -1026,9 +1026,7 @@ async def get_qa_run_for_replay(
         if not org:
             raise HTTPException(status_code=400, detail="missing_org")

-        resources = await self.resolve_signed_urls(
-            qa_run.files, org, crawl.id, qa_run_id
-        )
+        resources = await self.resolve_signed_urls(qa_run.files, org, crawl.id)

         qa_run.files = []
