Skip to content
Open
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
335 changes: 335 additions & 0 deletions tools/bin/mbedtls-move-to-framework
Original file line number Diff line number Diff line change
@@ -0,0 +1,335 @@
#!/usr/bin/env python3
"""Move files from the Mbed TLS repository to the framework repository.

Invoke this script from a Git working directory with a branch of Mbed TLS.
"""

# Copyright The Mbed TLS Contributors
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import pathlib
import re
import subprocess
import sys
from typing import List, Optional, Tuple, Dict


class RepoFileMover:
"""Move files between repos while retaining history."""

GIT_EXE = 'git'
FRAMEWORK_DEFAULT_BASE_BRANCH = 'main'

def __init__(self, source_repo: str, dest_repo: str,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please be consistent when abbreviating “destination”: either dst or dest, not both.

source_branch_name: str, dest_branch_name: str,
file_map: Dict[str, str],
dest_base_branch_name: Optional[str] = None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to document the parameters. It's not obvious what you can put in file_map, or the difference between dest_branch_name and dest_base_branch_name.

self._source_repo = os.path.abspath(source_repo)
self._dest_repo = os.path.abspath(dest_repo)

self._src_branch_name = source_branch_name
self._tmp_framework_branch_name = 'tmp-branch-move-files-to-framework'
self._framework_branch_name = dest_branch_name

if dest_base_branch_name is None:
self._framework_base_branch_name = self.FRAMEWORK_DEFAULT_BASE_BRANCH
else:
self._framework_base_branch_name = dest_base_branch_name

self._file_map = {os.path.normpath(k): os.path.normpath(v) for k, v in file_map.items()}

def run_git(self, git_cmd: List[str], **kwargs) -> str:
"""Run a git command and return its output."""
if 'universal_newlines' not in kwargs:
kwargs['universal_newlines'] = True
cmd = [self.GIT_EXE] + git_cmd
return subprocess.check_output(cmd, **kwargs)

def add_file_move(self, src_path: str, dst_path: str) -> None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method isn't used anywhere. __init__ should probably call it instead of setting _file_map on its own.

"""Move file at relative path src_path in the source repo to dst_path
in the destination repo"""
self._file_map[src_path] = dst_path
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why no normalization like there is for the initialization of _file_map?


def _path_parts(self, path: str) -> List[str]:
return path.split(os.path.sep)

def _direct_children(self, dir_path: str) -> List[str]:
"""Get the direct children of a subdirectory (at least the ones
that matter to git)"""
Comment on lines +98 to +99
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Common docstring conventions say to make the first paragraph of a docstring a single line. Mind you, we don't always follow that even in the mbedtls repository. But I hope that one day we'll set up documentation rendering for our Python scripts and then we'll want to be more uniform about docstring formatting.

leaf_files = self.run_git(['ls-files', dir_path]).split()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we ever have a file name containing a space, this will split on the space. (Control characters in file names would also not work properly.) The robust way would be

Suggested change
leaf_files = self.run_git(['ls-files', dir_path]).split()
leaf_files = self.run_git(['ls-files', '-z', '--', dir_path]).rstrip('\0').split('\0')

This applies to the other uses of ls-files.

It would be good to make an auxiliary method that calls ls-files and returns a list.

path_len = len(self._path_parts(dir_path)) + 1
direct_children = set([
os.path.join(
*self._path_parts(f)[:path_len]
)
for f in leaf_files
])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this would be equivalent but simpler: run git -C dir_path ls-files and strip anything after a slash.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Though looking at how this method is used, it seems that you're stripping the paths down to base names here, only to reconstruct the full path later in _expand_dir_move?

Copy link
Contributor Author

@davidhorstmann-arm davidhorstmann-arm Nov 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part removes the suffix that git ls-files outputs. We might get:

foo/bar/a.c
foo/bar/b.c
foo/baz

Which we reduce to just the direct descendants of foo, so:

foo/bar
foo/baz

We never reconstruct the suffix in _expand_dir_move(), but we substitute the prefix, e.g. if foo is being moved to new_foo:

new_foo/bar
new_foo/baz

This is a part of the transformation required for the --except option.

I was not aware that passing -C with a directory had this effect though, that will certainly simplify this.

return list(direct_children)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The caller only cares about iterating once, not about having a stored object in a particular format, so you could return the set directly and declare the return type as Iterable[str].

However, a quirk of the current code is that the order of traversal of a set is unspecified, and in practice it's randomized with all modern versions of CPython. For debugging, it sometimes help to have deterministic code, which you can get by sorting, which returns a list.

Suggested change
return list(direct_children)
return sorted(direct_children)


def _expand_dir_move(self, dir_path: str) -> None:
"""Expand a directory move into the moves of its children"""
if dir_path in self._file_map:
# Get the direct children of this directory
expanded_paths = self._direct_children(dir_path)
# Make an equivalent mapping for each direct child
for path in expanded_paths:
self._file_map[path] = (self._file_map[dir_path]
+ path[len(dir_path):])
# Remove the mapping for the original directory, since we want
# some files to remain behind in it.
self._file_map.pop(dir_path, None)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since you know dir_path is present and you aren't using its current value, the simpler way is

Suggested change
self._file_map.pop(dir_path, None)
del self._file_map.pop[dir_path]


def add_exception(self, src_path: str) -> None:
path_components = self._path_parts(src_path)
# Expand the directories prefixing the exception,
# starting with the shortest prefix
for i in range(1, len(path_components)):
self._expand_dir_move(os.path.join(*path_components[:i]))
# Remove the exception
self._file_map.pop(src_path, None)

def ensure_dirs_exist(self, path: str) -> None:
"""Make sure all of the required directories exist for
moving a given file or directory to it."""

# Remove any trailing slashes
while path[-1] == os.sep:
path = path[:-1]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Simpler:

Suggested change
while path[-1] == os.sep:
path = path[:-1]
path = path.rstrip(os.sep)


# Get the directory names
dirs, file = os.path.split(path)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Simpler:

Suggested change
dirs, file = os.path.split(path)
dirs = os.path.dirname(path)


if dirs != '':
# Make all required directories
os.makedirs(dirs, exist_ok=True)

def create_dest_repo_branch(self):
"""Create the branch containing the moved files only"""

# Create a new branch
self.run_git(['checkout', '-b', self._tmp_framework_branch_name])

# Get a list of all files in the repo
repo_files = self.run_git(['ls-files']).split()

# Get a set of the files we are moving (taking account of directories)
files_to_retain = set()
for f in self._file_map:
if os.path.isdir(f):
dir_files = self.run_git(['ls-files', f]).split()
dir_files = [os.path.normpath(dir_file) for dir_file in dir_files]
files_to_retain.update(dir_files)
else:
files_to_retain.add(os.path.normpath(f))

# Delete all files except the ones we are moving
for f in repo_files:
if os.path.normpath(f) not in files_to_retain:
self.run_git(['rm', f])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we ever have a file whose name starts with a dash (granted, that's unlikely), git will interpret that file's name as an option. Applies everywhere we pass a file name to git.

Suggested change
self.run_git(['rm', f])
self.run_git(['rm', '--', f])

Copy link
Contributor

@gilles-peskine-arm gilles-peskine-arm Oct 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BLOCKER: As far as I understand it (more info in Slack thread), this will in particular remove framework. So from that point, when checking out the original head again, the framework submodule is no longer initialized. Then later the framework directory itself disappears, which I don't understand.

I don't fully understand what's going on, but as far as I can tell, you can't have the destination be the framework directory of the source, it has to be a separate worktree. This is a very unintuitive limitation that needs to be at least documented.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is now documented and I have added a check so that an error message is printed if you try to move files to a framework submodule within the Mbed TLS checkout that you are moving from.


# Rename files as requested
for f in self._file_map:
# Git won't let us move things to themselves
if self._file_map[f] != f:
self.ensure_dirs_exist(self._file_map[f])
self.run_git(['mv', f, self._file_map[f]])

# Commit the result
commit_message = "Move some files to framework repository"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like the ability to specify a commit message. I can do it afterwards by rebasing, and that's not really a problem in the source branch, but in the destination branch, the commit message is behind a merge and git is reluctant to edit history beyond a merge.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In fact, if I want to edit the commit message, I need to redo the conflict resolution which is implemented here as resolve_deletion_conflicts. That's not easy!

self.run_git(['commit', '-asm', commit_message])

def create_src_repo_branch(self):
"""Create the branch deleting the moved files"""

# Create a new branch
self.run_git(['checkout', '-b', self._src_branch_name])

# Delete the moved files
files_to_delete = self._file_map.keys()
for f in files_to_delete:
if os.path.isdir(f):
self.run_git(['rm', '-r', f])
else:
self.run_git(['rm', f])

# Commit the result
commit_message = "Move some files to framework repository"
self.run_git(['commit', '-asm', commit_message])

def resolve_deletion_conflicts(self):
file_statuses = self.run_git(['status', '--porcelain'])

# Get conflicting files
files_by_status = {}
for line in file_statuses.splitlines():
# Process lines like:
# D dir1/myfile.txt
# M myfile.csv
# etc
line = line.strip().split()

if len(line) != 2:
continue
status = line[0]
filename = line[1]

if len(status) == 2:
# Check cases where conflicts need to be resolved
if status not in ('AD', 'DA', 'UD', 'DU'):
# Resolve conflicts only if one file has been deleted.
# We can't deal with multiple-modifications etc
return False

if status in files_by_status:
files_by_status[status].append(filename)
else:
files_by_status[status] = [filename]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arguably simpler:

Suggested change
if status in files_by_status:
files_by_status[status].append(filename)
else:
files_by_status[status] = [filename]
files_by_status.setdefault(status, [])
files_by_status[status].append(filename)


# Restore any deleted files
if 'D' in files_by_status:
for f in files_by_status['D']:
self.run_git(['checkout', 'HEAD', '--', f])

# Add all files
self.run_git(['add', '.'])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This assumes that there are no files that git calls “other” (neither checked in nor ignored). That's an acceptable limitation, but please document it. Ideally the script should refuse to run if such files are present.


return True

def merge_files_into_framework(self):
"""Add the source repo as a remote, pull in the destination branch
and merge into a new branch"""
# Change to the desination repo
os.chdir(self._dest_repo)

# Fetch/checkout the branch that has the moved files on it
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's no checkout here.

self.run_git(['fetch', self._source_repo,
self._tmp_framework_branch_name + ':' + self._tmp_framework_branch_name])

# Checkout the default branch
self.run_git(['checkout', self._framework_base_branch_name])

# Create a new branch with checkout -b
self.run_git(['checkout', '-b', self._framework_branch_name])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add checks for common errors before starting to do anything. At least check that the branches don't already exist, which is likely to be the case the second time the same person uses the script.


try:
# Merge in the previously-fetched branch using --allow-unrelated-histories
self.run_git(['merge', self._tmp_framework_branch_name, '--allow-unrelated-histories'])
except subprocess.CalledProcessError as git_error:
# We have conflicts, resolve them if possible
if not self.resolve_deletion_conflicts():
raise git_error

# Continue the merge without opening an editor
self.run_git(['-c', 'core.editor=/bin/true',

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fails on MacOS as /bin/true does not exist in that directory. Instead it is /usr/bin/true.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can just use true, Git will do a path lookup. (Actually you can even write arbitrary shell code there.)

'merge', '--continue'])

# Edge case: when we are not renaming the files, their deletion in
# previous branches moved to the framework will take precedence without
# merge conficts. Treat these files specially and restore them.
Comment on lines +295 to +297
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't this also happen when we're renaming the file, and the target name is one that used to exist but was removed before today? Granted that's not very likely to happen.

for f in self._file_map:
if self._file_map[f] == f:
self.run_git(['checkout', self._tmp_framework_branch_name, '--', f])
self.run_git(['add', f])
self.run_git(['-c', 'core.editor=/bin/true',

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fails on MacOS as /bin/true does not exist in that directory. Instead it is /usr/bin/true.

'commit', '--amend'])
Comment on lines +302 to +303
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't that just

Suggested change
self.run_git(['-c', 'core.editor=/bin/true',
'commit', '--amend'])
self.run_git(['commit', '--amend', '--no-edit'])


def move_files(self):
os.chdir(self._source_repo)

base_commit = self.run_git(['rev-parse', 'HEAD']).strip()

self.create_dest_repo_branch()
# Reset state of repo
self.run_git(['checkout', base_commit])

self.create_src_repo_branch()
# Reset state of repo
self.run_git(['checkout', base_commit])

self.merge_files_into_framework()

# Delete the temporary branches in both repos
self.run_git(['branch', '-D', self._tmp_framework_branch_name])

os.chdir(self._source_repo)
self.run_git(['branch', '-D', self._tmp_framework_branch_name])


def main() -> int:
"""Command line entry point."""
parser = argparse.ArgumentParser()
parser.add_argument('--path', action='append', nargs='?',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nargs='?' doesn't make sense. This option requires an argument (currently, if you don't pass one, the parser lets it through, but then you end up with None in the list which is consumed by code that requires a string). Applies to --except as well.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than require --path to be repeated for each path, I think this should be the positional arguments. In particular it would allow calls like mbedtls-move-to-framework … subdir/*.py.

help='Path to move, (colon-separate for renames)'
' (e.g. "tests/scripts" or "tests/scripts:foo")')
parser.add_argument('--except', action='append', nargs='?',
dest='file_exceptions',
default=[],
help='Exceptions to --path arguments'
' (Files or directories not to move')
parser.add_argument('--from', dest='from_repo',
metavar='FROM', required=True,
help='Path to the Mbed TLS repo to move files from')
parser.add_argument('--to', dest='to_repo', metavar='TO', required=True,
help='Path to the framework repo to move files to')
parser.add_argument('--src-branch',
default='move-files-from-mbedtls-to-framework',
required=False,
help='Name of the new branch create in the Mbed TLS repo')
parser.add_argument('--dst-branch', default='move-files-into-framework',
required=False,
help='Name of the new branch create in the framework repo')
parser.add_argument('--dst-base-branch', default=None, required=False,
help='Base branch in the framework repo to make changes from')
Comment on lines +350 to +351
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That doesn't actually need to be a branch name, it can be any revision. In practice I'd want whatever is the current head of the framework submodule, and I think that should be the default, rather than main.

In fact main is likely not to work in practice, because it'll typically be some old reference made when the working directory was originally cloned I delete the default branch name in my local clones to avoid keeping that old reference around, so when I didn't pass --dst-branch-name, I got the error

fatal: 'main' matched multiple (4) remote tracking branches

parser.add_argument('--print-file-map', action='store_true',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is typically called --dry-run and often abbreviated -n. The name --print-… suggests that the option enables tracing output but not that it doesn't perform the action.

help='Don\'t perform the move, just print out'
'the file map created (used for debugging)')
args = parser.parse_args()

file_map = {}
for p in args.path:
# If the name contains a colon it is a rename,
# otherwise just a plain path.
if ':' in p:
rename = p.split(':')
if len(rename) != 2:
print('Error: Path rename must be of the form "src:dest"',
file=sys.stderr)
return 1

file_map[rename[0]] = rename[1]

else:
file_map[p] = p

file_mover = RepoFileMover(args.from_repo, args.to_repo,
args.src_branch, args.dst_branch,
file_map,
args.dst_base_branch)

for e in args.file_exceptions:
file_mover.add_exception(e)

if args.print_file_map:
for k, v in file_mover._file_map.items():
print(f'{k}: {v}')
else:
file_mover.move_files()

return 0

if __name__ == '__main__':
sys.exit(main())