Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit143ffbf

Browse files
committed
Admin/XMover: Add problematic translog detection and cancellation
The new command can find and cancel replica shards with largeuncommitted translog sizes. It provides options to set size thresholdsand optionally execute cancel commands after user confirmation.
1 parent256ff3d commit143ffbf

File tree

4 files changed

+422
-1
lines changed

4 files changed

+422
-1
lines changed

‎cratedb_toolkit/admin/xmover/analysis/shard.py‎

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -838,6 +838,106 @@ def plan_node_decommission(self, node_name: str, min_free_space_gb: float = 100.
838838
}
839839

840840

841+
classTranslogReporter:
842+
def__init__(self,client:CrateDBClient):
843+
self.client=client
844+
845+
defproblematic_translogs(self,size_mb:int)->List[str]:
846+
"""Find and optionally cancel shards with problematic translog sizes."""
847+
console.print(Panel.fit("[bold blue]Problematic Translog Analysis[/bold blue]"))
848+
console.print(f"[dim]Looking for replica shards with translog uncommitted size >{size_mb}MB[/dim]")
849+
console.print()
850+
851+
# Query to find problematic replica shards
852+
query="""
853+
SELECT sh.schema_name,\
854+
sh.table_name,\
855+
translate(p.values::text, ':{}', '=()') as partition_values,\
856+
sh.id AS shard_id,\
857+
node['name'] as node_name,\
858+
sh.translog_stats['uncommitted_size'] / 1024^2 AS translog_uncommitted_mb
859+
FROM
860+
sys.shards AS sh
861+
LEFT JOIN information_schema.table_partitions p
862+
ON sh.table_name = p.table_name
863+
AND sh.schema_name = p.table_schema
864+
AND sh.partition_ident = p.partition_ident
865+
WHERE
866+
sh.state = 'STARTED'
867+
AND sh.translog_stats['uncommitted_size']\
868+
> ? * 1024^2
869+
AND primary = FALSE
870+
ORDER BY
871+
6 DESC\
872+
"""
873+
874+
try:
875+
result=self.client.execute_query(query, [size_mb])
876+
rows=result.get("rows", [])
877+
878+
ifnotrows:
879+
console.print(f"[green]✓ No replica shards found with translog uncommitted size >{size_mb}MB[/green]")
880+
return []
881+
882+
console.print(f"Found{len(rows)} shards with problematic translogs:")
883+
console.print()
884+
885+
# Display query results table
886+
results_table=Table(title=f"Problematic Replica Shards (translog >{size_mb}MB)",box=box.ROUNDED)
887+
results_table.add_column("Schema",style="cyan")
888+
results_table.add_column("Table",style="blue")
889+
results_table.add_column("Partition",style="magenta")
890+
results_table.add_column("Shard ID",justify="right",style="yellow")
891+
results_table.add_column("Node",style="green")
892+
results_table.add_column("Translog MB",justify="right",style="red")
893+
894+
forrowinrows:
895+
schema_name,table_name,partition_values,shard_id,node_name,translog_mb=row
896+
partition_display= (
897+
partition_valuesifpartition_valuesandpartition_values!="NULL"else"[dim]none[/dim]"
898+
)
899+
results_table.add_row(
900+
schema_name,table_name,partition_display,str(shard_id),node_name,f"{translog_mb:.1f}"
901+
)
902+
903+
console.print(results_table)
904+
console.print()
905+
console.print("[bold]Generated ALTER Commands:[/bold]")
906+
console.print()
907+
908+
# Generate ALTER commands
909+
alter_commands= []
910+
forrowinrows:
911+
schema_name,table_name,partition_values,shard_id,node_name,translog_mb=row
912+
913+
# Build the ALTER command based on whether it's partitioned
914+
ifpartition_valuesandpartition_values!="NULL":
915+
# partition_values already formatted like ("sync_day"=1757376000000) from the translate function
916+
alter_cmd= (
917+
f'ALTER TABLE "{schema_name}"."{table_name}" partition{partition_values} '
918+
f"REROUTE CANCEL SHARD{shard_id} on '{node_name}' WITH (allow_primary=False);"
919+
)
920+
else:
921+
alter_cmd= (
922+
f'ALTER TABLE "{schema_name}"."{table_name}" '
923+
f"REROUTE CANCEL SHARD{shard_id} on '{node_name}' WITH (allow_primary=False);"
924+
)
925+
926+
alter_commands.append(alter_cmd)
927+
console.print(alter_cmd)
928+
929+
console.print()
930+
console.print(f"[bold]Total:{len(alter_commands)} ALTER commands generated[/bold]")
931+
returnalter_commands
932+
933+
exceptExceptionase:
934+
console.print(f"[red]Error analyzing problematic translogs:{e}[/red]")
935+
importtraceback
936+
937+
console.print(f"[dim]{traceback.format_exc()}[/dim]")
938+
return []
939+
940+
841941
classShardReporter:
842942
def__init__(self,analyzer:ShardAnalyzer):
843943
self.analyzer=analyzer

‎cratedb_toolkit/admin/xmover/cli.py‎

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,12 @@
1212
fromrich.consoleimportConsole
1313
fromrich.panelimportPanel
1414

15-
fromcratedb_toolkit.admin.xmover.analysis.shardimportActiveShardMonitor,ShardAnalyzer,ShardReporter
15+
fromcratedb_toolkit.admin.xmover.analysis.shardimport (
16+
ActiveShardMonitor,
17+
ShardAnalyzer,
18+
ShardReporter,
19+
TranslogReporter,
20+
)
1621
fromcratedb_toolkit.admin.xmover.analysis.tableimportDistributionAnalyzer
1722
fromcratedb_toolkit.admin.xmover.analysis.zoneimportZoneReport
1823
fromcratedb_toolkit.admin.xmover.modelimport (
@@ -444,6 +449,69 @@ def run_single_analysis():
444449
console.print(f"[dim]{traceback.format_exc()}[/dim]")
445450

446451

452+
@main.command()
453+
@click.option("--size-mb",default=300,help="Minimum translog uncommitted size in MB (default: 300)")
454+
@click.option("--cancel",is_flag=True,help="Execute the cancel commands after confirmation")
455+
@click.pass_context
456+
defproblematic_translogs(ctx,size_mb:int,cancel:bool):
457+
"""
458+
Find and optionally cancel shards with problematic translog sizes.
459+
460+
This command identifies replica shards with large uncommitted translog sizes
461+
that may indicate replication issues. By default, it shows the ALTER commands
462+
that would cancel these shards. With --cancel, it executes them after confirmation.
463+
"""
464+
client=ctx.obj["client"]
465+
report=TranslogReporter(client=client)
466+
alter_commands=report.problematic_translogs(size_mb=size_mb)
467+
468+
try:
469+
ifcancelandalter_commands:
470+
console.print()
471+
console.print("[yellow]⚠️ WARNING: This will cancel the specified shards![/yellow]")
472+
console.print("[yellow]This may cause temporary data unavailability for these shards.[/yellow]")
473+
console.print()
474+
475+
ifclick.confirm("Are you sure you want to execute these ALTER commands?"):
476+
console.print()
477+
console.print("[bold blue]Executing ALTER commands...[/bold blue]")
478+
479+
executed=0
480+
failed=0
481+
482+
fori,cmdinenumerate(alter_commands,1):
483+
ifcmd.startswith("--"):
484+
console.print(f"[yellow]Skipping command{i} (parse error):{cmd}[/yellow]")
485+
continue
486+
487+
try:
488+
console.print(f"[dim]({i}/{len(alter_commands)}) Executing...[/dim]")
489+
client.execute_query(cmd)
490+
console.print(f"[green]✓ Command{i} executed successfully[/green]")
491+
executed+=1
492+
exceptExceptionase:
493+
console.print(f"[red]✗ Command{i} failed:{e}[/red]")
494+
failed+=1
495+
496+
# Small delay between commands to avoid overwhelming the cluster
497+
ifi<len(alter_commands):
498+
time.sleep(1)
499+
500+
console.print()
501+
console.print("[bold]Execution Summary:[/bold]")
502+
console.print(f"[green]✓ Successful:{executed}[/green]")
503+
iffailed>0:
504+
console.print(f"[red]✗ Failed:{failed}[/red]")
505+
else:
506+
console.print("[yellow]Operation cancelled by user[/yellow]")
507+
508+
exceptExceptionase:
509+
console.print(f"[red]Error analyzing problematic translogs:{e}[/red]")
510+
importtraceback
511+
512+
console.print(f"[dim]{traceback.format_exc()}[/dim]")
513+
514+
447515
@main.command()
448516
@click.option("--table","-t",help="Analyze zones for specific table only")
449517
@click.option("--show-shards/--no-show-shards",default=False,help="Show individual shard details (default: False)")

‎doc/admin/xmover/handbook.md‎

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,47 @@ xmover monitor-recovery --watch --include-transitioning
245245
-**DISK**: Rebuilding shard from local data (after restart/disk issues)
246246

247247

248+
###`problematic-translogs`
249+
Find and optionally cancel replica shards with problematic translog sizes.
250+
251+
**Options:**
252+
-`--size-mb INTEGER`: Minimum translog uncommitted size in MB (default: 300)
253+
-`--cancel`: Execute the cancel commands after confirmation
254+
255+
**Description:**
256+
This command identifies replica shards with large uncommitted translog sizes that may indicate replication issues. By default, it shows the ALTER commands that would cancel these shards. With`--cancel`, it executes them after confirmation.
257+
258+
**Examples:**
259+
```bash
260+
# Show problematic shards with translog > 300MB (default)
261+
xmover problematic-translogs
262+
263+
# Show shards with translog > 500MB
264+
xmover problematic-translogs --size-mb 500
265+
266+
# Execute cancel commands for shards > 1GB after confirmation
267+
xmover problematic-translogs --size-mb 1000 --cancel
268+
```
269+
270+
**Sample Output:**
271+
```
272+
Found 3 shards with problematic translogs:
273+
Problematic Replica Shards (translog > 300MB)
274+
╭────────┬───────────────────────────────┬────────────────────────────┬──────────┬────────────┬─────────────╮
275+
│ Schema │ Table │ Partition │ Shard ID │ Node │ Translog MB │
276+
├────────┼───────────────────────────────┼────────────────────────────┼──────────┼────────────┼─────────────┤
277+
│ TURVO │ shipmentFormFieldData │ none │ 14 │ data-hot-6 │ 7040.9 │
278+
│ TURVO │ shipmentFormFieldData_events │ ("sync_day"=1757376000000) │ 3 │ data-hot-2 │ 481.2 │
279+
│ TURVO │ orderFormFieldData │ none │ 5 │ data-hot-1 │ 469.5 │
280+
╰────────┴───────────────────────────────┴────────────────────────────┴──────────┴────────────┴─────────────╯
281+
Generated ALTER Commands:
282+
ALTER TABLE "TURVO"."shipmentFormFieldData" REROUTE CANCEL SHARD 14 on 'data-hot-6' WITH (allow_primary=False);
283+
ALTER TABLE "TURVO"."shipmentFormFieldData_events" partition ("sync_day"=1757376000000) REROUTE CANCEL SHARD 3 on 'data-hot-2' WITH (allow_primary=False);
284+
ALTER TABLE "TURVO"."orderFormFieldData" REROUTE CANCEL SHARD 5 on 'data-hot-1' WITH (allow_primary=False);
285+
Total: 3 ALTER commands generated
286+
```
287+
288+
248289
###`active-shards`
249290
Monitor the most active shards by tracking checkpoint progression over time.
250291
This command helps identify which shards are receiving the most write activity

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp