diff --git a/backend/src/app/tasks/file_media_gc.clj b/backend/src/app/tasks/file_media_gc.clj new file mode 100644 index 000000000..ae8a2274b --- /dev/null +++ b/backend/src/app/tasks/file_media_gc.clj @@ -0,0 +1,103 @@ +;; This Source Code Form is subject to the terms of the Mozilla Public +;; License, v. 2.0. If a copy of the MPL was not distributed with this +;; file, You can obtain one at http://mozilla.org/MPL/2.0/. +;; +;; This Source Code Form is "Incompatible With Secondary Licenses", as +;; defined by the Mozilla Public License, v. 2.0. +;; +;; Copyright (c) 2020 UXBOX Labs SL + +(ns app.tasks.file-media-gc + "A maintenance task that is responsible to purge the unused media + objects from files. A file is ellegible to be garbage collected + after some period of inactivity (the default threshold is 72h)." + (:require + [app.common.pages.migrations :as pmg] + [app.config :as cfg] + [app.db :as db] + [app.tasks :as tasks] + [app.util.blob :as blob] + [app.util.time :as dt] + [clojure.tools.logging :as log])) + +(defn decode-row + [{:keys [data] :as row}] + (cond-> row + (bytes? data) (assoc :data (blob/decode data)))) + +(def sql:retrieve-candidates-chunk + "select f.id, + f.data, + extract(epoch from (now() - f.modified_at))::bigint as age + from file as f + where f.has_media_trimmed is false + and f.modified_at < now() - ?::interval + order by f.modified_at asc + limit 10 + for update skip locked") + +(defn retrieve-candidates + "Retrieves a list of files that are candidates to be garbage + collected." + [conn] + (let [threshold (:file-trimming-threshold cfg/config) + interval (db/interval threshold)] + (->> (db/exec! conn [sql:retrieve-candidates-chunk interval]) + (map (fn [{:keys [age] :as row}] + (assoc row :age (dt/duration {:seconds age}))))))) + +(def collect-media-xf + (comp + (map :objects) + (mapcat vals) + (filter #(= :image (:type %))) + (map :metadata) + (map :id))) + +(defn- collect-used-media + [data] + (-> #{} + (into collect-media-xf (vals (:pages-index data))) + (into collect-media-xf (vals (:components data))) + (into (keys (:media data))))) + +(defn- process-file + [conn {:keys [id data age] :as file}] + (let [data (-> (blob/decode data) + (assoc :id id) + (pmg/migrate-data)) + + used (collect-used-media data) + unused (->> (db/query conn :media-object {:file-id id}) + (remove #(contains? used (:id %))))] + + (log/infof "processing file: id='%s' age='%s' to-delete=%s" id age (count unused)) + + ;; Mark file as trimmed + (db/update! conn :file + {:has-media-trimmed true} + {:id id}) + + (doseq [mobj unused] + (log/debugf "schduling object deletion: id='%s' path='%s' delay='%s'" + (:id mobj) (:path mobj) cfg/default-deletion-delay) + (tasks/submit! conn {:name "delete-object" + :delay cfg/default-deletion-delay + :props {:id id :type :media-object}}) + + ;; Mark object as deleted + (db/update! conn :media-object + {:deleted-at (dt/now)} + {:id id})) + + nil)) + +(defn handler + [_task] + (log/debug "running 'file-media-gc' task.") + (db/with-atomic [conn db/pool] + (loop [] + (let [files (retrieve-candidates conn)] + (when (seq files) + (run! (partial process-file conn) files) + (recur)))))) diff --git a/backend/src/app/tasks/trim_file.clj b/backend/src/app/tasks/trim_file.clj deleted file mode 100644 index 6ff2146e5..000000000 --- a/backend/src/app/tasks/trim_file.clj +++ /dev/null @@ -1,102 +0,0 @@ -;; This Source Code Form is subject to the terms of the Mozilla Public -;; License, v. 2.0. If a copy of the MPL was not distributed with this -;; file, You can obtain one at http://mozilla.org/MPL/2.0/. -;; -;; This Source Code Form is "Incompatible With Secondary Licenses", as -;; defined by the Mozilla Public License, v. 2.0. -;; -;; Copyright (c) 2020 UXBOX Labs SL - -(ns app.tasks.trim-file - (:require - [app.common.pages.migrations :as pmg] - [app.config :as cfg] - [app.db :as db] - [app.tasks :as tasks] - [app.util.blob :as blob] - [app.util.time :as dt] - [clojure.tools.logging :as log])) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Task: Trim File -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; This is the task responsible of removing unnecesary media-objects -;; associated with file but not used by any page. - -(defn decode-row - [{:keys [data] :as row}] - (cond-> row - (bytes? data) (assoc :data (blob/decode data)))) - -(def sql:retrieve-files-to-trim - "select f.id, f.data - from file as f - where f.has_media_trimmed is false - and f.modified_at < now() - ?::interval - order by f.modified_at asc - limit 10") - -(defn retrieve-candidates - "Retrieves a list of ids of files that are candidates to be trimed. A - file is considered candidate when some time passes whith no - modification." - [conn] - (let [threshold (:file-trimming-threshold cfg/config) - interval (db/interval threshold)] - (db/exec! conn [sql:retrieve-files-to-trim interval]))) - -(def collect-media-xf - (comp - (map :objects) - (mapcat vals) - (filter #(= :image (:type %))) - (map :metadata) - (map :id))) - -(defn collect-used-media - [data] - (-> #{} - (into collect-media-xf (vals (:pages-index data))) - (into collect-media-xf (vals (:components data))) - (into (keys (:media data))))) - -(defn process-file - [{:keys [id data] :as file}] - (log/debugf "Processing file: '%s'." id) - (db/with-atomic [conn db/pool] - (let [mobjs (map :id (db/query conn :media-object {:file-id id})) - data (-> (blob/decode data) - (assoc :id id) - (pmg/migrate-data)) - - used (collect-used-media data) - unused (into #{} (remove #(contains? used %)) mobjs)] - - (log/debugf "Collected media ids: '%s'." (pr-str used)) - (log/debugf "Unused media ids: '%s'." (pr-str unused)) - - (db/update! conn :file - {:has-media-trimmed true} - {:id id}) - - (doseq [id unused] - ;; TODO: add task batching - (tasks/submit! conn {:name "delete-object" - ;; :delay cfg/default-deletion-delay - :delay 10000 - :props {:id id :type :media-object}}) - - (db/update! conn :media-object - {:deleted-at (dt/now)} - {:id id})) - nil))) - -(defn handler - [_task] - (log/debug "Running 'trim-file' task.") - (loop [] - (let [files (retrieve-candidates db/pool)] - (when (seq files) - (run! process-file files) - (recur))))) diff --git a/backend/src/app/worker.clj b/backend/src/app/worker.clj index f7de49abf..7462ced1e 100644 --- a/backend/src/app/worker.clj +++ b/backend/src/app/worker.clj @@ -18,7 +18,7 @@ [app.tasks.maintenance] [app.tasks.remove-media] [app.tasks.sendmail] - [app.tasks.trim-file] + [app.tasks.file-media-gc] [app.util.async :as aa] [app.util.time :as dt] [clojure.core.async :as a] @@ -52,9 +52,9 @@ :cron #app/cron "0 0 0 */1 * ? *" ;; daily :fn #'app.tasks.remove-media/trim-media-storage} - {:id "trim-file" + {:id "file-media-gc" :cron #app/cron "0 0 0 */1 * ? *" ;; daily - :fn #'app.tasks.trim-file/handler} + :fn #'app.tasks.file-media-gc/handler} {:id "maintenance/delete-executed-tasks" :cron #app/cron "0 0 0 */1 * ?" ;; daily