IMHO you'd do better to put the images & captions into floating tables, so you can ensure they always move together, using something like:
Sub AddImageCaptionTables()
Dim iShp As InlineShape, Rng As Range, Tbl As Table
Dim i As Long, PicWdth As Single, PicHght As Single, VPos As Single
Dim HPos As Single, VRel As Long, HRel As Long, BShp As Boolean
With ActiveDocument
  For i = 1 To .InlineShapes.Count
    If .InlineShapes(i).Range.Information(wdWithInTable) = False Then
      PicWdth = .InlineShapes(i).Width
      Set Rng = .InlineShapes(i).Range
      With Rng
        If .Characters.Last.Next.Text = vbCr Then .Characters.Last.Next.Text = vbNullString
        PicWdth = .InlineShapes(1).Width
        PicHght = .InlineShapes(1).Height
        .InlineShapes(1).Range.Cut
      End With
      BShp = False: VRel = 0: HRel = 0: VPos = 0: HPos = 0
      Call MakeImageTable(Rng, PicWdth, PicHght, BShp, VRel, HRel, VPos, HPos)
    End If
  Next
  While .Shapes.Count > 0
    BShp = True
    With .Shapes(1)
      PicWdth = .Width
      PicHght = .Height
      VRel = .RelativeVerticalPosition
      HRel = .RelativeHorizontalPosition
      VPos = .Top
      HPos = .Left
      Set iShp = .ConvertToInlineShape
    End With
    With iShp
      Set Rng = .Range
      .Range.Cut
    End With
    Call MakeImageTable(Rng, PicWdth, PicHght, BShp, VRel, HRel, VPos, HPos)
  Wend
End With
End Sub
 
Sub MakeImageTable(Rng As Range, PicWdth As Single, PicHght As Single, BShp As Boolean, _
  VRel As Long, HRel As Long, VPos As Single, HPos As Single)
Dim Tbl As Table
'Create & format the table
Set Tbl = Rng.Tables.Add(Range:=Rng, Numrows:=2, NumColumns:=1)
With Tbl
  .Borders.Enable = True
  .Columns.Width = PicWdth
  .TopPadding = 0
  .BottomPadding = 0
  .LeftPadding = 0
  .RightPadding = 0
  .Spacing = 0
  .Rows(1).HeightRule = wdRowHeightExactly
  .Rows(1).Height = PicHght
  With .Rows
    .LeftIndent = 0
    If BShp = True Then
      .WrapAroundText = True
      .HorizontalPosition = HPos
      .RelativeHorizontalPosition = HRel
      .VerticalPosition = VPos
      .RelativeVerticalPosition = VRel
      .AllowOverlap = False
    End If
  End With
  With .Cell(1, 1).Range
    With .ParagraphFormat
      .SpaceBefore = 0
      .SpaceAfter = 0
      .LeftIndent = 0
      .RightIndent = 0
      .FirstLineIndent = 0
      .KeepWithNext = True
    End With
    .Paste
  End With
  With .Cell(2, 1).Range
    .Style = "Caption"
    .End = .End - 1
    .InsertAfter vbCr
    .InsertCaption Label:="Figure", TitleAutoText:=" ", Title:="", _
      Position:=wdCaptionPositionBelow, ExcludeLabel:=0
    .Paragraphs.First.Range.Characters.Last.Text = vbNullString
    .Borders(wdBorderLeft).LineStyle = wdLineStyleNone
    .Borders(wdBorderRight).LineStyle = wdLineStyleNone
    .Borders(wdBorderTop).LineStyle = wdLineStyleNone
  End With
End With
End Sub
The above code embeds all Shapes & InlineShapes in a document in tables as InlineShapes with a row for Captions. Whatever positioning applied to the original Shape object will apply to the table also. This arrangement has the advantage that captions in floating tables will show up in a Table of Figures, whereas captions in textboxes don't.

To have the caption row appear above the image, instead of below, change the Rows(1) references to Rows(2), change the Cell(1, 1) references to Cell(2, 1) and change the change the Cell(2, 1) references to Cell(1, 1).

To work with just the selected image (or multiple images in a selected range), change:
With ActiveDocument
to:
With Selection
and change:
  While .Shapes.Count > 0
    With .Shapes(1)
to:
  While .ShapeRange.Count > 0
    With .ShapeRange(1)

I haven't included code for the user to type in the caption while the code is running. That can be done post-reformatting and would be too error-prone (IMHO) if the code is processing multiple images.