Add safety mechanism for direct object deletion

The main objective is prevent deletion of objects that can leave
unreachable orphan objects which we are unable to correctly track.

Additionally, this commit includes:

1. Properly implement safe cascade deletion of all participating
   tables on soft deletion in the objects-gc task;

2. Make the file thumbnail related tables also participate in the
   touch/refcount mechanism applyign to the same safety checks;

3. Add helper for db query lazy iteration using PostgreSQL support
   for server side cursors;

4. Fix efficiency issues on gc related task using server side
   cursors instead of custom chunked iteration for processing data.

   The problem resided when a large chunk of rows that has identical
   value on the deleted_at column and the chunk size is small (the
   default); when the custom chunked iteration only reads a first N
   items and skip the rest of the set to the next run.

   This has caused many objects to remain pending to be eliminated,
   taking up space for longer than expected. The server side cursor
   based iteration does not has this problem and iterates correctly
   over all objects.

5. Fix refcount issues on font variant deletion RPC methods
This commit is contained in:
Andrey Antukh 2023-12-29 15:21:14 +01:00
parent e6fb96c4c2
commit addb392ecc
37 changed files with 1918 additions and 1026 deletions

View file

@ -10,7 +10,6 @@
file is eligible to be garbage collected after some period of
inactivity (the default threshold is 72h)."
(:require
[app.common.data :as d]
[app.common.files.migrations :as pmg]
[app.common.logging :as l]
[app.common.thumbnails :as thc]
@ -30,7 +29,7 @@
[integrant.core :as ig]))
(declare ^:private get-candidates)
(declare ^:private process-file)
(declare ^:private clean-file!)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; HANDLER
@ -44,67 +43,61 @@
(assoc cfg ::min-age cf/deletion-delay))
(defmethod ig/init-key ::handler
[_ {:keys [::db/pool] :as cfg}]
[_ cfg]
(fn [{:keys [file-id] :as params}]
(db/tx-run! cfg
(fn [{:keys [::db/conn] :as cfg}]
(let [min-age (dt/duration (or (:min-age params) (::min-age cfg)))
cfg (-> cfg
(update ::sto/storage media/configure-assets-storage conn)
(assoc ::file-id file-id)
(assoc ::min-age min-age))
(db/with-atomic [conn pool]
(let [min-age (dt/duration (or (:min-age params) (::min-age cfg)))
cfg (-> cfg
(update ::sto/storage media/configure-assets-storage conn)
(assoc ::db/conn conn)
(assoc ::file-id file-id)
(assoc ::min-age min-age))
total (reduce (fn [total file]
(clean-file! cfg file)
(inc total))
0
(get-candidates cfg))]
total (reduce (fn [total file]
(process-file cfg file)
(inc total))
0
(get-candidates cfg))]
(l/inf :hint "task finished"
:min-age (dt/format-duration min-age)
:processed total)
(l/info :hint "task finished" :min-age (dt/format-duration min-age) :processed total)
;; Allow optional rollback passed by params
(when (:rollback? params)
(db/rollback! conn))
;; Allow optional rollback passed by params
(when (:rollback? params)
(db/rollback! conn))
{:processed total}))))
{:processed total})))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; IMPL
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(def ^:private
sql:get-candidates-chunk
"select f.id,
sql:get-candidates
"SELECT f.id,
f.data,
f.revn,
f.features,
f.modified_at
from file as f
where f.has_media_trimmed is false
and f.modified_at < now() - ?::interval
and f.modified_at < ?
order by f.modified_at desc
limit 1
for update skip locked")
FROM file AS f
WHERE f.has_media_trimmed IS false
AND f.modified_at < now() - ?::interval
ORDER BY f.modified_at DESC
FOR UPDATE
SKIP LOCKED")
(defn- get-candidates
[{:keys [::db/conn ::min-age ::file-id]}]
(if (uuid? file-id)
(do
(l/warn :hint "explicit file id passed on params" :file-id file-id)
(l/warn :hint "explicit file id passed on params" :file-id (str file-id))
(->> (db/query conn :file {:id file-id})
(map #(update % :features db/decode-pgarray #{}))))
(let [interval (db/interval min-age)
get-chunk (fn [cursor]
(let [rows (db/exec! conn [sql:get-candidates-chunk interval cursor])]
[(some->> rows peek :modified-at)
(map #(update % :features db/decode-pgarray #{}) rows)]))]
(d/iteration get-chunk
:vf second
:kf first
:initk (dt/now)))))
(let [min-age (db/interval min-age)]
(->> (db/cursor conn [sql:get-candidates min-age] {:chunk-size 1})
(map #(update % :features db/decode-pgarray #{}))))))
(defn collect-used-media
"Given a fdata (file data), returns all media references."
@ -134,101 +127,93 @@
(into xform pages)
(into (keys (:media data))))))
(def ^:private sql:mark-file-media-object-deleted
"UPDATE file_media_object
SET deleted_at = now()
WHERE file_id = ? AND id != ALL(?::uuid[])
RETURNING id")
(defn- clean-file-media!
"Performs the garbage collection of file media objects."
[conn file-id data]
(let [used (collect-used-media data)
unused (->> (db/query conn :file-media-object {:file-id file-id})
(remove #(contains? used (:id %))))]
ids (db/create-array conn "uuid" used)
unused (->> (db/exec! conn [sql:mark-file-media-object-deleted file-id ids])
(into #{} (map :id)))]
(doseq [mobj unused]
(l/dbg :hint "delete file media object"
:id (:id mobj)
:media-id (:media-id mobj)
:thumbnail-id (:thumbnail-id mobj))
(doseq [id unused]
(l/trc :hint "mark deleted"
:rel "file-media-object"
:id (str id)
:file-id (str file-id)))
;; NOTE: deleting the file-media-object in the database
;; automatically marks as touched the referenced storage
;; objects. The touch mechanism is needed because many files can
;; point to the same storage objects and we can't just delete
;; them.
(db/delete! conn :file-media-object {:id (:id mobj)}))))
(count unused)))
(def ^:private sql:mark-file-object-thumbnails-deleted
"UPDATE file_tagged_object_thumbnail
SET deleted_at = now()
WHERE file_id = ? AND object_id != ALL(?::text[])
RETURNING object_id")
(defn- clean-file-object-thumbnails!
[{:keys [::db/conn ::sto/storage]} file-id data]
(let [stored (->> (db/query conn :file-tagged-object-thumbnail
{:file-id file-id}
{:columns [:object-id]})
(into #{} (map :object-id)))
[{:keys [::db/conn]} file-id data]
(let [using (->> (vals (:pages-index data))
(into #{} (comp
(mapcat (fn [{:keys [id objects]}]
(->> (ctt/get-frames objects)
(map #(assoc % :page-id id)))))
(mapcat (fn [{:keys [id page-id]}]
(list
(thc/fmt-object-id file-id page-id id "frame")
(thc/fmt-object-id file-id page-id id "component")))))))
using (into #{}
(comp
(mapcat (fn [{:keys [id objects]}]
(->> (ctt/get-frames objects)
(map #(assoc % :page-id id)))))
(mapcat (fn [{:keys [id page-id]}]
(list
(thc/fmt-object-id file-id page-id id "frame")
(thc/fmt-object-id file-id page-id id "component")))))
ids (db/create-array conn "text" using)
unused (->> (db/exec! conn [sql:mark-file-object-thumbnails-deleted file-id ids])
(into #{} (map :object-id)))]
(vals (:pages-index data)))
(doseq [object-id unused]
(l/trc :hint "mark deleted"
:rel "file-tagged-object-thumbnail"
:object-id object-id
:file-id (str file-id)))
unused (set/difference stored using)]
(count unused)))
(when (seq unused)
(let [sql (str "delete from file_tagged_object_thumbnail "
" where file_id=? and object_id=ANY(?)"
" returning media_id")
res (db/exec! conn [sql file-id (db/create-array conn "text" unused)])]
(l/dbg :hint "delete file object thumbnails"
:file-id (str file-id)
:total (count res))
(doseq [media-id (into #{} (keep :media-id) res)]
;; Mark as deleted the storage object related with the
;; photo-id field.
(l/trc :hint "touch file object thumbnail storage object" :id (str media-id))
(sto/touch-object! storage media-id))))))
(def ^:private sql:mark-file-thumbnails-deleted
"UPDATE file_thumbnail
SET deleted_at = now()
WHERE file_id = ? AND revn < ?
RETURNING revn")
(defn- clean-file-thumbnails!
[{:keys [::db/conn ::sto/storage]} file-id revn]
(let [sql (str "delete from file_thumbnail "
" where file_id=? and revn < ? "
" returning media_id")
res (db/exec! conn [sql file-id revn])]
[{:keys [::db/conn]} file-id revn]
(let [unused (->> (db/exec! conn [sql:mark-file-thumbnails-deleted file-id revn])
(into #{} (map :revn)))]
(when (seq res)
(l/dbg :hint "delete file thumbnails"
:file-id (str file-id)
:total (count res))
(doseq [revn unused]
(l/trc :hint "mark deleted"
:rel "file-thumbnail"
:revn revn
:file-id (str file-id)))
(doseq [media-id (into #{} (keep :media-id) res)]
;; Mark as deleted the storage object related with the
;; media-id field.
(l/trc :hint "delete file thumbnail storage object" :id (str media-id))
(sto/del-object! storage media-id)))))
(count unused)))
(def ^:private
sql:get-files-for-library
"select f.data, f.modified_at
from file as f
left join file_library_rel as fl on (fl.file_id = f.id)
where fl.library_file_id = ?
and f.modified_at < ?
and f.deleted_at is null
order by f.modified_at desc
limit 1")
(def ^:private sql:get-files-for-library
"SELECT f.id, f.data, f.modified_at
FROM file AS f
LEFT JOIN file_library_rel AS fl ON (fl.file_id = f.id)
WHERE fl.library_file_id = ?
AND f.deleted_at IS null
ORDER BY f.modified_at ASC")
(defn- clean-deleted-components!
"Performs the garbage collection of unreferenced deleted components."
[conn file-id data]
(letfn [(get-files-chunk [cursor]
(let [rows (db/exec! conn [sql:get-files-for-library file-id cursor])]
[(some-> rows peek :modified-at)
(map (comp blob/decode :data) rows)]))
(get-used-components [fdata components]
[{:keys [::db/conn] :as cfg} file-id data]
(letfn [(get-used-components [fdata components]
;; Find which of the components are used in the file.
(into #{}
(filter #(ctf/used-in? fdata file-id % :component))
@ -246,69 +231,91 @@
files-data))]
(let [deleted (into #{} (ctkl/deleted-components-seq data))
unused (->> (d/iteration get-files-chunk :vf second :kf first :initk (dt/now))
unused (->> (db/cursor conn [sql:get-files-for-library file-id] {:chunk-size 1})
(map (fn [{:keys [id data] :as file}]
(binding [pmap/*load-fn* (partial feat.fdata/load-pointer cfg id)]
(-> (blob/decode data)
(feat.fdata/process-pointers deref)))))
(cons data)
(get-unused-components deleted)
(mapv :id))]
(when (seq unused)
(l/dbg :hint "clean deleted components" :total (count unused))
(doseq [id unused]
(l/trc :hint "delete component" :component-id (str id) :file-id (str file-id)))
(let [data (reduce ctkl/delete-component data unused)]
(db/update! conn :file
{:data (blob/encode data)}
{:id file-id}))))))
(when-let [data (some->> (seq unused)
(reduce ctkl/delete-component data)
(blob/encode))]
(db/update! conn :file
{:data data}
{:id file-id}
{::db/return-keys? false}))
(count unused))))
(def ^:private sql:get-changes
"SELECT id, data FROM file_change
WHERE file_id = ? AND data IS NOT NULL
ORDER BY created_at ASC")
(def ^:private sql:mark-deleted-data-fragments
"UPDATE file_data_fragment
SET deleted_at = now()
WHERE file_id = ?
AND id != ALL(?::uuid[])
RETURNING id")
(defn- clean-data-fragments!
[conn file-id data]
(letfn [(get-pointers-chunk [cursor]
(let [sql (str "select id, data, created_at "
" from file_change "
" where file_id = ? "
" and data is not null "
" and created_at < ? "
" order by created_at desc "
" limit 1;")
rows (db/exec! conn [sql file-id cursor])]
[(some-> rows peek :created-at)
(mapcat (comp feat.fdata/get-used-pointer-ids blob/decode :data) rows)]))]
(let [used (->> (db/cursor conn [sql:get-changes file-id])
(into (feat.fdata/get-used-pointer-ids data)
(comp (map :data)
(map blob/decode)
(mapcat feat.fdata/get-used-pointer-ids))))
(let [used (into (feat.fdata/get-used-pointer-ids data)
(d/iteration get-pointers-chunk
:vf second
:kf first
:initk (dt/now)))
unused (let [ids (db/create-array conn "uuid" used)]
(->> (db/exec! conn [sql:mark-deleted-data-fragments file-id ids])
(into #{} (map :id))))]
sql (str "select id from file_data_fragment "
" where file_id = ? AND id != ALL(?::uuid[])")
used (db/create-array conn "uuid" used)
rows (db/exec! conn [sql file-id used])]
(doseq [id unused]
(l/trc :hint "mark deleted"
:rel "file-data-fragment"
:id (str id)
:file-id (str file-id)))
(doseq [fragment-id (map :id rows)]
(l/trc :hint "remove unused file data fragment" :id (str fragment-id))
(db/delete! conn :file-data-fragment {:id fragment-id :file-id file-id})))))
(count unused)))
(defn- process-file
[{:keys [::db/conn] :as cfg} {:keys [id data revn modified-at features] :as file}]
(l/dbg :hint "processing file" :file-id (str id) :modified-at modified-at)
(defn- clean-file!
[{:keys [::db/conn] :as cfg} {:keys [id data revn modified-at] :as file}]
(binding [pmap/*load-fn* (partial feat.fdata/load-pointer cfg id)
pmap/*tracked* (pmap/create-tracked)]
(let [data (-> (blob/decode data)
(assoc :id id)
(pmg/migrate-data))]
(pmg/migrate-data))
(clean-file-media! conn id data)
(clean-file-object-thumbnails! cfg id data)
(clean-file-thumbnails! cfg id revn)
(clean-deleted-components! conn id data)
nfm (clean-file-media! conn id data)
nfot (clean-file-object-thumbnails! cfg id data)
nft (clean-file-thumbnails! cfg id revn)
nc (clean-deleted-components! cfg id data)
ndf (clean-data-fragments! conn id data)]
(when (contains? features "fdata/pointer-map")
(clean-data-fragments! conn id data))
(l/dbg :hint "file clened"
:file-id (str id)
:modified-at (dt/format-instant modified-at)
:media-objects nfm
:thumbnails nft
:object-thumbnails nfot
:components nc
:data-fragments ndf)
;; Mark file as trimmed
(db/update! conn :file
{:has-media-trimmed true}
{:id id})
{:id id}
{::db/return-keys? false})
(feat.fdata/persist-pointers! cfg id))))