dist: forbid multiple inputs which create same file

Current code will allow multiple input data labels
to produce a file with the same name. If flat=True,
subsequent labels will overwrite files from previous
labels. This seems dangerous and could lead to users
unintentionally distributing the wrong file.

By default, when flat=True, forbid multiple inputs that
create the same file in the output directory: log each
conflict and exit with an error. Provide an
allow_duplicate_filenames argument to restore the legacy
behavior (later files overwrite earlier ones).

Bug: 244215515
Change-Id: I2a591e057d66c41e7f5139614f3cc525e1866c17
Signed-off-by: John Moon <quic_johmoo@quicinc.com>
diff --git a/dist/dist.bzl b/dist/dist.bzl
index 23b6596..bab387e 100644
--- a/dist/dist.bzl
+++ b/dist/dist.bzl
@@ -66,6 +66,7 @@
         archive_prefix = None,
         dist_dir = None,
         wipe_dist_dir = None,
+        allow_duplicate_filenames = None,
         log = None,
         **kwargs):
     """A dist rule to copy files out of Bazel's output directory into a custom location.
@@ -103,6 +104,14 @@
           See details by running the target with `--help`.
         wipe_dist_dir: If true, and `dist_dir` already exists, `dist_dir` will be removed prior to
           copying.
+        allow_duplicate_filenames: If true, duplicate filenames from different sources will be allowed to
+          be copied to the same `dist_dir` (with subsequent sources overwriting previous sources).
+
+          With this option enabled, order matters: when several sources produce the same filename, the
+          one listed last in `data` is the version that ends up in `dist_dir`.
+
+          Use of this option is discouraged. Preferably, the input `data` targets would not include labels
+          which produce a duplicate filename. This option is available as a last resort.
         log: If specified, `--log <log>` is provided to the script by default. This sets the
           default log level of the script.
 
@@ -128,6 +137,8 @@
         default_args += ["--dist_dir", dist_dir]
     if wipe_dist_dir:
         default_args.append("--wipe_dist_dir")
+    if allow_duplicate_filenames:
+        default_args.append("--allow_duplicate_filenames")
     if log != None:
         default_args += ["--log", log]
 
diff --git a/dist/dist.py b/dist/dist.py
index 580b686..c384664 100644
--- a/dist/dist.py
+++ b/dist/dist.py
@@ -32,6 +32,7 @@
 """
 
 import argparse
+import collections
 import glob
 import logging
 import os
@@ -39,6 +40,21 @@
 import sys
 import tarfile
 
+def ensure_unique_filenames(files):
+    basename_to_srcs_map = collections.defaultdict(list)
+    for f in files:
+        basename_to_srcs_map[os.path.basename(f)].append(f)
+
+    duplicates_exist = False
+    for (basename, srcs) in basename_to_srcs_map.items():
+        if len(srcs) > 1:
+            duplicates_exist = True
+            logging.error('Destination filename "%s" has multiple possible sources: %s',
+                         basename, srcs)
+
+    if duplicates_exist:
+        sys.exit(1)
+
 
 def files_to_dist(pattern):
     # Assume that dist.bzl is in the same package as dist.py
@@ -56,7 +72,10 @@
 
 
 def copy_files_to_dist_dir(files, archives, dist_dir, flat, prefix,
-    strip_components, archive_prefix, wipe_dist_dir, **ignored):
+    strip_components, archive_prefix, wipe_dist_dir, allow_duplicate_filenames, **ignored):
+
+    if flat and not allow_duplicate_filenames:
+        ensure_unique_filenames(files)
 
     if wipe_dist_dir and os.path.exists(dist_dir):
         shutil.rmtree(dist_dir)
@@ -139,6 +158,11 @@
         action="store_true",
         help="remove existing dist_dir prior to running"
     )
+    parser.add_argument(
+        "--allow_duplicate_filenames",
+        action="store_true",
+        help="allow multiple files with the same name to be copied to dist_dir (overwriting)"
+    )
 
     args = parser.parse_args(sys.argv[1:])