kudu-commits mailing list archives

From mpe...@apache.org
Subject [kudu] 02/02: [backup] Add a tool to clean up old backups
Date Tue, 28 May 2019 22:41:53 GMT
This is an automated email from the ASF dual-hosted git repository.

mpercy pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit a5a8da655ca8f0088dcd39301bd9bd87e182c460
Author: Will Berkeley <wdberkeley@gmail.com>
AuthorDate: Wed May 22 16:18:08 2019 -0700

    [backup] Add a tool to clean up old backups
    
    This adds a tool that cleans up old backups. Eligibility for cleanup is
    a property of the backup path, not the actual backup. There are two
    conditions:
    1. The path is not the restore path.
    2. Every backup on the path is older than the expiration age, which the
       tool allows users to express in days. The default is 30 days.
    If a path satisfies both conditions, the tool will delete its backups,
    from most recent to least recent. This way, if the tool terminates
    mid-delete, the next run of the tool can pick up where the previous one
    left off. However, a backup on the restore path will not be deleted,
    even if it lies on a path that is expired.
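
    A minimal sketch of that logic, condensed from the implementation below
    ('graph', 'now', 'expirationAge', and 'io' stand for the backup graph,
    the current instant, the configured expiration age, and the BackupIO
    instance):

        import java.time.Instant

        // A path is expired only when its *latest* backup is older than
        // the expiration age.
        val expiredPaths = graph.backupPaths.filter { path =>
          val last = Instant.ofEpochSecond(path.lastBackup.metadata.getToMs / 1000)
          now.isAfter(last.plus(expirationAge))
        }
        // Skip backups on the restore path, and delete latest-first so a
        // crash leaves a usable prefix of the path behind.
        val onRestorePath = graph.restorePath.backups.toSet
        expiredPaths.foreach { path =>
          path.backups
            .filterNot(onRestorePath.contains)
            .reverse
            .foreach(b => io.deleteBackup(b.metadata))
        }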
    
    The tool has a dry run mode, a verbose mode, and a table name filter. It
    does not normally output anything on success.
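
    As a hedged usage sketch (the root path and table names below are
    illustrative, not taken from the patch), the cleanup can be driven
    programmatically through the same entry point the CLI uses:

        import java.time.Duration
        import java.time.temporal.ChronoUnit

        // Dry-run cleanup of two tables; like a command, run() returns 0
        // on success.
        val rc = KuduBackupCleaner.run(
          BackupCleanerOptions(
            tables = Seq("table1", "table2"),        // hypothetical names
            rootPath = "hdfs:///kudu-backups",       // hypothetical path
            expirationAge = Duration.of(30, ChronoUnit.DAYS),
            dryRun = true,
            verbose = false))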
    
    There are a couple of situations not currently handled by this tool:
    1. Failed backups with no metadata.
    2. Backup paths with no full backup at the beginning.
    This is because the backup graph abstraction doesn't presently handle
    these cases in a way the tool can use. The tool can be further improved
    by a follow-up that enhances the backup graph.
    
    Change-Id: I4a4c4dde10ec3c3c9b2233ee2f7b13c46e963e39
    Reviewed-on: http://gerrit.cloudera.org:8080/13405
    Tested-by: Kudu Jenkins
    Reviewed-by: Mike Percy <mpercy@apache.org>
---
 .../scala/org/apache/kudu/backup/BackupIO.scala    |  11 +-
 .../org/apache/kudu/backup/KuduBackupCleaner.scala | 148 +++++++++++++++++++++
 .../org/apache/kudu/backup/TestKuduBackupCLI.scala |  27 +---
 .../apache/kudu/backup/TestKuduBackupCleaner.scala | 135 +++++++++++++++++++
 .../scala/org/apache/kudu/backup/TestUtils.scala   |  42 ++++++
 .../scala/org/apache/kudu/backup/KuduBackup.scala  |   2 +-
 .../org/apache/kudu/backup/TestKuduBackup.scala    |   2 +-
 7 files changed, 337 insertions(+), 30 deletions(-)

diff --git a/java/kudu-backup-tools/src/main/scala/org/apache/kudu/backup/BackupIO.scala b/java/kudu-backup-tools/src/main/scala/org/apache/kudu/backup/BackupIO.scala
index 1cf3140..7807132 100644
--- a/java/kudu-backup-tools/src/main/scala/org/apache/kudu/backup/BackupIO.scala
+++ b/java/kudu-backup-tools/src/main/scala/org/apache/kudu/backup/BackupIO.scala
@@ -77,8 +77,8 @@ class BackupIO(val conf: Configuration, rootPathStr: String) {
   /**
    * Return the backup path for a table and time.
    */
-  def backupPath(table: KuduTable, timestampMs: Long): Path = {
-    new Path(tablePath(table.getTableId, table.getName), timestampMs.toString)
+  def backupPath(tableId: String, tableName: String, timestampMs: Long): Path = {
+    new Path(tablePath(tableId, tableName), timestampMs.toString)
   }
 
   /**
@@ -101,6 +101,13 @@ class BackupIO(val conf: Configuration, rootPathStr: String) {
   }
 
   /**
+   * Recursively deletes the backup directory identified by the metadata.
+   */
+  def deleteBackup(metadata: TableMetadataPB): Unit = {
+    fs.delete(backupPath(metadata.getTableId, metadata.getTableName, metadata.getToMs), true)
+  }
+
+  /**
    * Reads all of the backup graphs.
    */
   def readAllBackupGraphs(): Seq[BackupGraph] = {
diff --git a/java/kudu-backup-tools/src/main/scala/org/apache/kudu/backup/KuduBackupCleaner.scala b/java/kudu-backup-tools/src/main/scala/org/apache/kudu/backup/KuduBackupCleaner.scala
new file mode 100644
index 0000000..8462aa9
--- /dev/null
+++ b/java/kudu-backup-tools/src/main/scala/org/apache/kudu/backup/KuduBackupCleaner.scala
@@ -0,0 +1,148 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+package org.apache.kudu.backup
+
+import java.time.temporal.ChronoUnit
+import java.time.Duration
+import java.time.Instant
+
+import org.apache.hadoop.conf.Configuration
+import scopt.OptionParser
+
+import org.apache.kudu.backup.Backup.TableMetadataPB
+
+case class BackupCleanerOptions(
+    tables: Seq[String],
+    rootPath: String,
+    expirationAge: Duration,
+    dryRun: Boolean,
+    verbose: Boolean)
+
+object BackupCleanerOptions {
+  val DefaultExpirationAge: Duration = Duration.of(30, ChronoUnit.DAYS)
+
+  val ProgramName: String =
+    KuduBackupCleaner.getClass.getCanonicalName.dropRight(1) // Remove trailing `$`
+
+  val parser: OptionParser[BackupCleanerOptions] =
+    new OptionParser[BackupCleanerOptions](ProgramName) {
+      opt[String]("rootPath")
+        .action((v, o) => o.copy(rootPath = v))
+        .text("The root path to search for backups. Accepts any Hadoop compatible path.")
+        .required()
+
+      opt[String]("expirationAgeDays")
+        .action((v, o) => o.copy(expirationAge = Duration.of(v.toLong, ChronoUnit.DAYS)))
+        .text("The age at which old backups should be expired. Backups that are part of the
current restore path are never expired.")
+        .optional()
+
+      opt[Boolean]("dryRun")
+        .action((v, o) => o.copy(dryRun = v))
+        .text(
+          "Report on what backups will be deleted, but don't delete anything. Overrides --verbose.")
+        .optional()
+
+      opt[Boolean]("verbose")
+        .action((v, o) => o.copy(verbose = v))
+        .text("Report on what backups are deleted.")
+        .optional()
+
+      arg[String]("<table>...")
+        .unbounded()
+        .action((v, o) => o.copy(tables = o.tables :+ v))
+        .text("A list of tables whose backups should be garbage-collected. Specifying no
tables includes all tables.")
+        .optional()
+
+      help("help").text("Prints this usage text")
+    }
+
+  def parse(args: Seq[String]): Option[BackupCleanerOptions] = {
+    parser.parse(
+      args,
+      BackupCleanerOptions(Seq(), null, DefaultExpirationAge, dryRun = false, verbose = false))
+  }
+}
+
+object KuduBackupCleaner {
+
+  private def backupToShortString(metadata: TableMetadataPB): String = {
+    s"name: ${metadata.getTableName}, id: ${metadata.getTableId}, fromMs: ${metadata.getFromMs},
toMs: ${metadata.getToMs}"
+  }
+
+  // Run the cleanup tool with the given options. Like a command, returns 0 if successful, or
+  // a nonzero error code.
+  def run(options: BackupCleanerOptions): Int = {
+    // Delete all backups that satisfy the following three conditions:
+    // 1. The table name matches one of the provided names (does not apply if no names were specified).
+    // 2. The backup is part of a path whose latest backup is older than the expiration age.
+    // 3. The backup is not on the current restore path.
+    // TODO(KUDU-2827): Consider dropped tables eligible for deletion once they reach a certain age.
+    val io: BackupIO = new BackupIO(new Configuration(), options.rootPath)
+    val backupGraphs =
+      if (options.tables.isEmpty)
+        io.readAllBackupGraphs()
+      else
+        io.readBackupGraphsByTableName(options.tables)
+    val now = Instant.now()
+
+    val tableNameSet = options.tables.toSet
+    backupGraphs.foreach { graph =>
+      val expiredPaths = graph.backupPaths.filter(path => {
+        val lastBackupInstant = Instant.ofEpochSecond(path.lastBackup.metadata.getToMs / 1000)
+        now.isAfter(lastBackupInstant.plus(options.expirationAge))
+      })
+
+      // The graph might be for a table that once had a name in 'options.tables', but we only
+      // want to clean up tables whose current name is in 'options.tables'.
+      // TODO: This is temporary. It will change when pattern support is added.
+      val currentTableName = graph.restorePath.tableName
+      if (tableNameSet.isEmpty || tableNameSet.contains(currentTableName)) {
+        // For each expired path, iterate over it from latest backup to earliest backup and delete
+        // the backup, unless the backup-to-be-deleted is also part of the restore path. Deleting
+        // from last to first in the path ensures that if the tool crashes partway through then a
+        // prefix of the backup path is preserved and the tool can delete the rest of the eligible
+        // backups next time it runs.
+        val restoreSet = graph.restorePath.backups.toSet
+        expiredPaths.foreach(path => {
+          path.backups
+            .filterNot(restoreSet.contains)
+            .reverseMap(backup => {
+              if (options.dryRun) {
+                println(s"DRY RUN: Delete backup ${backupToShortString(backup.metadata)}")
+              } else {
+                if (options.verbose) {
+                  println(s"Delete backup ${backupToShortString(backup.metadata)}")
+                }
+                // TODO(wdberkeley): Make this crash-consistent by handling backup directories with no
+                //  metadata.
+                io.deleteBackup(backup.metadata)
+              }
+            })
+        })
+      }
+    }
+
+    0
+  }
+
+  def main(args: Array[String]): Unit = {
+    val options = BackupCleanerOptions
+      .parse(args)
+      .getOrElse(throw new IllegalArgumentException("could not parse the arguments"))
+    System.exit(run(options))
+  }
+}
diff --git a/java/kudu-backup-tools/src/test/scala/org/apache/kudu/backup/TestKuduBackupCLI.scala b/java/kudu-backup-tools/src/test/scala/org/apache/kudu/backup/TestKuduBackupCLI.scala
index 8302c8c..aec1686 100644
--- a/java/kudu-backup-tools/src/test/scala/org/apache/kudu/backup/TestKuduBackupCLI.scala
+++ b/java/kudu-backup-tools/src/test/scala/org/apache/kudu/backup/TestKuduBackupCLI.scala
@@ -23,7 +23,6 @@ import java.nio.file.Path
 import java.text.SimpleDateFormat
 
 import org.apache.commons.io.FileUtils
-import org.apache.hadoop.fs.{Path => HPath}
 import org.apache.hadoop.conf.Configuration
 import org.junit.After
 import org.junit.Assert._
@@ -32,9 +31,6 @@ import org.junit.Test
 import org.slf4j.Logger
 import org.slf4j.LoggerFactory
 
-import org.apache.kudu.backup.Backup.TableMetadataPB
-import org.apache.kudu.backup.TableMetadata.MetadataVersion
-
 class TestKuduBackupCLI {
   val log: Logger = LoggerFactory.getLogger(getClass)
 
@@ -50,27 +46,6 @@ class TestKuduBackupCLI {
     FileUtils.deleteDirectory(rootDir.toFile)
   }
 
-  // Create dummy table metadata and write it to the test directory.
-  private def createTableMetadata(
-      io: BackupIO,
-      tableName: String,
-      fromMs: Long,
-      toMs: Long): Unit = {
-    // Create dummy table metadata with just enough information to be used to create a BackupGraph.
-    val tableId = s"id_$tableName"
-    val metadata = TableMetadataPB
-      .newBuilder()
-      .setVersion(MetadataVersion)
-      .setFromMs(fromMs)
-      .setToMs(toMs)
-      .setTableName(tableName)
-      .setTableId(tableId)
-      .build()
-    val backupPath = new HPath(io.tablePath(tableId, tableName), s"$toMs")
-    val metadataPath = io.backupMetadataPath(backupPath)
-    io.writeTableMetadata(metadata, metadataPath)
-  }
-
   // Helper to write a standard collection of backup metadata useful for a few tests.
   private def createStandardTableMetadata(io: BackupIO): Unit = {
     Seq(
@@ -84,7 +59,7 @@ class TestKuduBackupCLI {
       ("pizza", 400, 600)
     ).foreach {
       case (tableName: String, fromMs: Int, toMs: Int) =>
-        createTableMetadata(io, tableName, fromMs, toMs)
+        TestUtils.createTableMetadata(io, tableName, fromMs, toMs)
     }
   }
 
diff --git a/java/kudu-backup-tools/src/test/scala/org/apache/kudu/backup/TestKuduBackupCleaner.scala b/java/kudu-backup-tools/src/test/scala/org/apache/kudu/backup/TestKuduBackupCleaner.scala
new file mode 100644
index 0000000..ffc7c45
--- /dev/null
+++ b/java/kudu-backup-tools/src/test/scala/org/apache/kudu/backup/TestKuduBackupCleaner.scala
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+package org.apache.kudu.backup
+
+import java.nio.file.Files
+import java.nio.file.Path
+import java.time.temporal.ChronoUnit
+import java.time.Duration
+import java.time.Instant
+
+import org.apache.commons.io.FileUtils
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{Path => HPath}
+import org.junit.After
+import org.junit.Assert._
+import org.junit.Before
+import org.junit.Test
+import org.slf4j.Logger
+import org.slf4j.LoggerFactory
+
+class TestKuduBackupCleaner {
+  val log: Logger = LoggerFactory.getLogger(getClass)
+
+  var rootDir: Path = _
+
+  @Before
+  def setUp(): Unit = {
+    rootDir = Files.createTempDirectory("backupcli")
+  }
+
+  @After
+  def tearDown(): Unit = {
+    FileUtils.deleteDirectory(rootDir.toFile)
+  }
+
+  // Return the epoch time in milliseconds that is 'secsBefore' seconds before 'current'.
+  private def epochMillisBefore(current: Instant, secsBefore: Long): Long = {
+    current.minus(Duration.of(secsBefore, ChronoUnit.SECONDS)).getEpochSecond * 1000
+  }
+
+  @Test
+  def testBackupCleaner(): Unit = {
+    val io = new BackupIO(new Configuration(), rootDir.toUri.toString)
+    val expirationAge = Duration.of(15, ChronoUnit.SECONDS)
+    val now = Instant.now
+    val tableName = "taco"
+
+    val createPath = (ages: Array[Long]) => {
+      for (i <- ages.indices) {
+        val fromMs = if (i == 0) 0 else epochMillisBefore(now, ages(i - 1))
+        val toMs = epochMillisBefore(now, ages(i))
+        TestUtils.createTableMetadata(io, tableName, fromMs, toMs)
+      }
+    }
+
+    // Create a graph of backups for a table incrementally. At first, there'll be one backup path,
+    // path A, which must therefore be the restore path. All of its backups will be older than the
+    // expiration age.
+    val pathA: Array[Long] = Array(25, 21, 16)
+    createPath(pathA)
+
+    // Nothing should be cleaned up because all backups are on the restore path.
+    val options =
+      BackupCleanerOptions(
+        Seq(),
+        rootDir.toUri.toString,
+        expirationAge,
+        dryRun = false,
+        verbose = true)
+    assertEquals(0, KuduBackupCleaner.run(options))
+
+    val backupExists = (secsAgo: Long) => {
+      val backupPath =
+        new HPath(io.tablePath(s"id_$tableName", tableName), s"${epochMillisBefore(now, secsAgo)}")
+      val metadataPath = io.backupMetadataPath(backupPath)
+      println(s"checking existence of $metadataPath")
+      io.fs.exists(metadataPath)
+    }
+
+    assertTrue(pathA.forall(backupExists(_)))
+
+    // Now add a new backup path, path B, that ends at a later time and so is the new restore path.
+    val pathB: Array[Long] = Array(20, 15, 10, 5, 1)
+    createPath(pathB)
+
+    // Add a backup with a from time of now - 20 and a to time of now - 18. The backup path that
+    // ends in this backup is expired, but it forks from the restore path.
+    TestUtils
+      .createTableMetadata(io, tableName, epochMillisBefore(now, 20), epochMillisBefore(now, 18))
+
+    // Running the cleaner should delete path A and the forked backup, but first do a dry run and
+    // make sure nothing gets deleted.
+    val dryRunOptions = BackupCleanerOptions(
+      Seq(),
+      rootDir.toUri.toString,
+      expirationAge,
+      dryRun = true,
+      verbose = false)
+    assertEquals(0, KuduBackupCleaner.run(dryRunOptions))
+
+    assertTrue(pathA.forall(backupExists(_)))
+    assertTrue(backupExists(18))
+    assertTrue(pathB.forall(backupExists(_)))
+
+    // After the cleaner runs, path A and the forked backup should be deleted and path B should remain.
+    assertEquals(0, KuduBackupCleaner.run(options))
+    assertTrue(pathA.forall(!backupExists(_)))
+    assertTrue(!backupExists(18))
+    assertTrue(pathB.forall(backupExists(_)))
+
+    // Finally, add a third path which is not the restore path but which has backups that are old
+    // enough to get deleted and backups that are too new to be deleted.
+    val pathC: Array[Long] = Array(19, 14, 9, 4, 2)
+    createPath(pathC)
+
+    assertEquals(0, KuduBackupCleaner.run(options))
+    assertTrue(pathA.forall(!backupExists(_)))
+    assertTrue(pathB.forall(backupExists(_)))
+    assertTrue(pathC.forall(backupExists(_)))
+  }
+}
diff --git a/java/kudu-backup-tools/src/test/scala/org/apache/kudu/backup/TestUtils.scala b/java/kudu-backup-tools/src/test/scala/org/apache/kudu/backup/TestUtils.scala
new file mode 100644
index 0000000..8bc754a
--- /dev/null
+++ b/java/kudu-backup-tools/src/test/scala/org/apache/kudu/backup/TestUtils.scala
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+package org.apache.kudu.backup
+
+import org.apache.hadoop.fs.Path
+
+import org.apache.kudu.backup.Backup.TableMetadataPB
+import org.apache.kudu.backup.TableMetadata.MetadataVersion
+
+object TestUtils {
+
+  // Create dummy table metadata and write it to the test directory.
+  def createTableMetadata(io: BackupIO, tableName: String, fromMs: Long, toMs: Long): Unit = {
+    // Create dummy table metadata with just enough information to be used to create a BackupGraph.
+    val tableId = s"id_$tableName"
+    val metadata = TableMetadataPB
+      .newBuilder()
+      .setVersion(MetadataVersion)
+      .setFromMs(fromMs)
+      .setToMs(toMs)
+      .setTableName(tableName)
+      .setTableId(tableId)
+      .build()
+    val backupPath = new Path(io.tablePath(tableId, tableName), s"$toMs")
+    val metadataPath = io.backupMetadataPath(backupPath)
+    io.writeTableMetadata(metadata, metadataPath)
+  }
+}
diff --git a/java/kudu-backup/src/main/scala/org/apache/kudu/backup/KuduBackup.scala b/java/kudu-backup/src/main/scala/org/apache/kudu/backup/KuduBackup.scala
index 180e7c8..65adcaf 100644
--- a/java/kudu-backup/src/main/scala/org/apache/kudu/backup/KuduBackup.scala
+++ b/java/kudu-backup/src/main/scala/org/apache/kudu/backup/KuduBackup.scala
@@ -70,7 +70,7 @@ object KuduBackup {
       var tableOptions = options.copy() // Copy the options so we can modify them for the table.
       val table = context.syncClient.openTable(tableName)
       val tableId = table.getTableId
-      val backupPath = io.backupPath(table, tableOptions.toMs)
+      val backupPath = io.backupPath(table.getTableId, table.getName, tableOptions.toMs)
       val metadataPath = io.backupMetadataPath(backupPath)
       log.info(s"Backing up table $tableName to path: $backupPath")
 
diff --git a/java/kudu-backup/src/test/scala/org/apache/kudu/backup/TestKuduBackup.scala b/java/kudu-backup/src/test/scala/org/apache/kudu/backup/TestKuduBackup.scala
index 8a066ac..ec7d5bc 100644
--- a/java/kudu-backup/src/test/scala/org/apache/kudu/backup/TestKuduBackup.scala
+++ b/java/kudu-backup/src/test/scala/org/apache/kudu/backup/TestKuduBackup.scala
@@ -575,7 +575,7 @@ class TestKuduBackup extends KuduTestSuite {
     val io = new BackupIO(ss.sparkContext.hadoopConfiguration, options.rootPath)
     val tableName = options.tables.head
     val table = harness.getClient.openTable(tableName)
-    val backupPath = io.backupPath(table, options.toMs)
+    val backupPath = io.backupPath(table.getTableId, table.getName, options.toMs)
     val metadataPath = io.backupMetadataPath(backupPath)
     val metadata = io.readTableMetadata(metadataPath)
 

