Skip to content

Commit f47b9a4

Browse files
authored
KAFKA-20362 Resolve reviewer email via GitHub commit search API (#22135)
The `resolve_reviewer` introduced in #22108 used local `git log` to find past `Reviewers:` trailers, but the PR linter runs with the default shallow checkout (`fetch-depth: 1`), so older merged PR trailers are not available on the runner. This PR switches reviewer email resolution to GitHub APIs in this order: 1. the reviewer's latest apache/kafka commit author email 2. `gh search commits` for prior `Reviewers:` trailers by display name, accepted only when the matched commit's associated PR has a review from the same GitHub login 3. the reviewer's GitHub public profile email 4. `Name (github:login)` fallback, without tagging the reviewer It also skips `pr-reviewed.yml` artifact creation once the PR is no longer open. For the motivating #22108 case, this resolves `@mimaison` as `Mickael Maison <mimaison@apache.org>` instead of falling back to the GitHub handle. `gh search commits` uses GitHub's Search API bucket. In CI it is authenticated via `GH_TOKEN`, so the limit is 30 requests/minute (unauthenticated is 10 requests/minute). This workflow normally resolves one reviewer per run. In local checks, `mjsax` and `mingyen066` resolved via T1 in 0.5-0.8s, while `mimaison` and `UladzislauBlok` resolved via T2's search-plus-review verification in about 3.0s. Reviewers: Chia-Ping Tsai <chia7712@gmail.com>
1 parent d02723a commit f47b9a4

2 files changed

Lines changed: 83 additions & 65 deletions

File tree

.github/scripts/pr-format.py

Lines changed: 82 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -107,86 +107,102 @@ def split_paragraphs(text: str):
107107
def resolve_reviewer(login: str) -> tuple:
108108
"""Map a GitHub login to (name, email).
109109
110-
Tries three tiers in order: repo commit history, GitHub user profile,
111-
and past `Reviewers:` trailers in git log (matched by name).
112-
Noreply emails (@users.noreply.github.com) are treated as missing since
113-
they are GitHub privacy placeholders that do not identify the reviewer.
114-
Returns (name, None) when no usable email is found; the caller falls
115-
back to the '(@login)' form in the Reviewers trailer.
110+
Tries reviewer email sources in order: repo commit author email, past
111+
`Reviewers:` trailers searched via GitHub commit search API (matched
112+
by name and verified by PR review login), and GitHub user profile
113+
public email. Noreply emails (@users.noreply.github.com) are treated
114+
as missing since they are GitHub privacy placeholders that do not
115+
identify the reviewer. Returns (name, None) when no usable email is
116+
found; the caller falls back to the '(github:login)' form in the
117+
Reviewers trailer.
116118
"""
117119
def _usable_email(e):
118120
if not e or e.endswith("@users.noreply.github.com"):
119121
return None
120122
return e
121123

122-
name = None
123-
email = None
124-
125-
# Tier 1: find from repo commit history. Misses when the reviewer has no
126-
# merged commit in apache/kafka, or had "Keep my email private" enabled
127-
# at commit time (GitHub rewrites the author to the noreply form).
128-
try:
129-
cmd = f"gh api repos/apache/kafka/commits?author={login}&per_page=1"
130-
p = subprocess.run(shlex.split(cmd), capture_output=True, text=True)
131-
if p.returncode == 0:
132-
commits = json.loads(p.stdout)
133-
if commits:
134-
author = commits[0].get("commit", {}).get("author", {})
135-
name = author.get("name")
136-
email = _usable_email(author.get("email"))
137-
except Exception as e:
138-
logger.debug(f"Failed to resolve {login} from commit history: {e}")
139-
140-
# Tier 2: GitHub user profile. Only exposes an email when the reviewer
141-
# has set a Public email in their profile settings.
142-
if not name or not email:
143-
try:
144-
cmd = f"gh api users/{login}"
145-
p = subprocess.run(shlex.split(cmd), capture_output=True, text=True)
146-
if p.returncode == 0:
147-
user = json.loads(p.stdout)
148-
if not name:
149-
name = user.get("name")
150-
if not email:
151-
email = _usable_email(user.get("email"))
152-
except Exception as e:
153-
logger.debug(f"Failed to resolve {login} from GitHub profile: {e}")
154-
155-
# Tier 3: past Reviewers: trailers in git log, matched by name. Catches
156-
# pure reviewers (no commits in apache/kafka, no public profile email)
157-
# who have been credited with a real email in an earlier merged PR.
158-
# git log is newest-first, so the first usable match is the most recent.
159-
if name and not email:
124+
def _run_json(cmd, source):
160125
try:
161-
p = subprocess.run(
162-
["git", "log",
163-
"--pretty=format:%(trailers:key=Reviewers,valueonly=true,unfold=true)"],
164-
capture_output=True, text=True,
165-
)
126+
p = subprocess.run(cmd, capture_output=True, text=True)
166127
if p.returncode == 0:
167-
pattern = re.compile(rf"{re.escape(name)}\s*<([^>]+)>")
168-
for line in p.stdout.splitlines():
169-
for m in pattern.finditer(line):
170-
candidate = _usable_email(m.group(1))
171-
if candidate:
172-
email = candidate
173-
break
174-
if email:
175-
break
128+
return json.loads(p.stdout)
129+
logger.debug(f"Failed to resolve {login} from {source}: {p.stderr}")
176130
except Exception as e:
177-
logger.debug(f"Failed to resolve {login} from past Reviewers trailers: {e}")
178-
179-
if not name:
180-
name = login
131+
logger.debug(f"Failed to resolve {login} from {source}: {e}")
132+
return None
181133

182-
return (name, email)
134+
def _has_pr_review_from_login(commit_sha):
135+
pulls = _run_json(["gh", "api", f"repos/apache/kafka/commits/{commit_sha}/pulls"],
136+
f"associated PRs for commit {commit_sha}") or []
137+
for pull in pulls:
138+
pr_number = pull.get("number")
139+
if not pr_number:
140+
continue
141+
reviews = _run_json(["gh", "api", f"repos/apache/kafka/pulls/{pr_number}/reviews?per_page=100"],
142+
f"reviews for PR {pr_number}") or []
143+
if any((review.get("user") or {}).get("login", "").lower() == login.lower()
144+
for review in reviews):
145+
return True
146+
return False
147+
148+
commits = _run_json(["gh", "api", f"repos/apache/kafka/commits?author={login}&per_page=1"],
149+
"commit history") or []
150+
author = commits[0].get("commit", {}).get("author", {}) if commits else {}
151+
152+
# Tier 1: latest repo commit authored by this GitHub login. Misses
153+
# when the reviewer has no merged commit in apache/kafka, or had
154+
# "Keep my email private" enabled at commit time (GitHub rewrites
155+
# the author to the noreply form).
156+
email = _usable_email(author.get("email"))
157+
if email:
158+
return (author.get("name") or login, email)
159+
160+
user = _run_json(["gh", "api", f"users/{login}"], "GitHub profile") or {}
161+
162+
name_candidates = []
163+
for candidate in (user.get("name"), author.get("name"), login):
164+
if candidate and candidate not in name_candidates:
165+
name_candidates.append(candidate)
166+
167+
name = name_candidates[0] if name_candidates else login
168+
169+
# Tier 2: past Reviewers: trailers in commit history, matched by name,
170+
# via the GitHub commit search API. Catches pure reviewers (no commits
171+
# in apache/kafka, no public profile email) who have been credited
172+
# with a real email in an earlier merged PR. Sort by committer-date
173+
# desc so the most recent email wins if a reviewer has changed it.
174+
# Full-text search is tokenized (not strict substring), so we re-verify
175+
# with a regex client-side. To avoid same-name matches, we only accept
176+
# a trailer email when the matched commit's associated PR includes a
177+
# review from this GitHub login.
178+
for candidate in name_candidates:
179+
results = _run_json(["gh", "search", "commits",
180+
"--repo", "apache/kafka",
181+
f'"{candidate} <"',
182+
"--limit", "10",
183+
"--sort", "committer-date",
184+
"--order", "desc",
185+
"--json", "sha,commit"],
186+
"commit search") or []
187+
pattern = re.compile(rf"{re.escape(candidate)}\s*<([^>]+)>")
188+
for result in results:
189+
msg = result.get("commit", {}).get("message", "")
190+
commit_sha = result.get("sha")
191+
for match in pattern.finditer(msg):
192+
candidate_email = _usable_email(match.group(1))
193+
if candidate_email and commit_sha and _has_pr_review_from_login(commit_sha):
194+
return (candidate, candidate_email)
195+
196+
# Tier 3: GitHub user profile. Only exposes an email when the reviewer
197+
# has set a Public email in their profile settings.
198+
return (name, _usable_email(user.get("email")))
183199

184200

185201
def already_exists(identity: str, existing_reviewers: List[str]) -> bool:
186202
"""Check if a reviewer identity is already in the existing reviewers list.
187203
188204
identity is the delimited token that uniquely identifies a reviewer, either
189-
'<email>' (for the email form) or '(@login)' (for the login fallback).
205+
'<email>' (for the email form) or '(github:login)' (for the login fallback).
190206
"""
191207
return identity.lower() in ", ".join(existing_reviewers).lower()
192208

@@ -246,7 +262,8 @@ def update_reviewers_trailer(body: str, trailer: str) -> str:
246262
if email:
247263
identity = f"<{email}>"
248264
else:
249-
identity = f"(@{reviewer_login})"
265+
# Tier 4: fall back to the GitHub handle without tagging the reviewer.
266+
identity = f"(github:{reviewer_login})"
250267
resolved = f"{name} {identity}"
251268
existing_reviewers = parse_trailers(title, body).get("Reviewers", [])
252269
if not already_exists(identity, existing_reviewers):

.github/workflows/pr-reviewed.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ jobs:
2929
save-pr-number:
3030
name: Save PR Number
3131
runs-on: ubuntu-latest
32+
if: github.event.pull_request.state == 'open'
3233
steps:
3334
- name: Env
3435
run: printenv

0 commit comments

Comments
 (0)