Mean_Flow_DNA/models.py at main · Utah-Math-Data-Science/Mean_Flow_DNA · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import torch.nn as nn
import torch

# import copy


class MLPModel(nn.Module):
    """Position-wise MLP that maps a sequence and two time values to logits.

    Every sequence position is processed independently by the same MLP. The
    network is conditioned on two times, ``t`` and ``r``, which are encoded by
    one shared time embedder. The final layer is a plain ``nn.Linear``, so the
    outputs are unnormalized logits (no softmax is applied here).
    """

    def __init__(self, args, alphabet_size):
        """
        Build the time embedder, the input projection and the MLP trunk.

        Args:
            args: Configuration object; only ``args.hidden_dim`` is read here.
            alphabet_size (int): Number of categories per sequence position.
                Also the size of the last output dimension.
        """
        super().__init__()
        self.alphabet_size = alphabet_size
        self.args = args

        width = args.hidden_dim

        # One embedder shared by both time inputs:
        # Fourier features -> linear -> ReLU.
        fourier_features = GaussianFourierProjection(embed_dim=width)
        time_linear = nn.Linear(width, width)
        self.time_embedder = nn.Sequential(fourier_features, time_linear, nn.ReLU())

        # The input's last dimension is expected to be
        # ``input_expansion * alphabet_size`` wide.
        self.input_expansion = 2
        self.embedder = nn.Linear(self.input_expansion * alphabet_size, width)

        # Trunk: two (Linear -> LayerNorm -> Dropout -> ReLU) stages followed
        # by a linear head. The first stage consumes the concatenation of the
        # sequence features with the t- and r-embeddings (3 * hidden_dim).
        stages = []
        fan_in = 3 * width
        for _ in range(2):
            stages.extend([
                nn.Linear(fan_in, width),
                nn.LayerNorm(width),
                nn.Dropout(0.1),
                nn.ReLU(),
            ])
            fan_in = width
        stages.append(nn.Linear(width, alphabet_size))
        self.mlp = nn.Sequential(*stages)

    def forward(self, x, t, r, cls=None):
        """
        Compute per-position logits.

        Args:
            x: [batch, seq_len, alphabet_size * input_expansion]
            t: [batch] (current time)
            r: [batch] (previous time)
            cls: optional [batch] class labels; accepted but not used by this model.

        Returns:
            Tensor of shape [batch, seq_len, alphabet_size].
        """
        # Embed both times with the shared embedder and add a length-1
        # sequence axis so they broadcast over positions: [batch, 1, hidden_dim].
        t_ctx = self.time_embedder(t).unsqueeze(1)
        r_ctx = self.time_embedder(r).unsqueeze(1)

        # Project the sequence and inject the time information additively.
        tokens = self.embedder(x)  # [batch, seq_len, hidden_dim]
        tokens = tokens + t_ctx + r_ctx

        # The time embeddings are additionally concatenated to every position,
        # so the trunk sees them both mixed into the features and on their own.
        seq_len = x.size(1)
        trunk_input = torch.cat(
            (
                tokens,
                t_ctx.expand(-1, seq_len, -1),
                r_ctx.expand(-1, seq_len, -1),
            ),
            dim=-1,
        )  # [batch, seq_len, hidden_dim * 3]

        return self.mlp(trunk_input)  # [batch, seq_len, alphabet_size]


class GaussianFourierProjection(nn.Module):
    """
    Random Fourier feature encoder for time steps.

    Holds ``embed_dim // 2`` frequencies drawn once from a normal distribution
    and multiplied by ``scale``. They are stored as a parameter with
    ``requires_grad=False``, so they are saved with the module but receive no
    gradient.
    """
    def __init__(self, embed_dim, scale=30.):
        """
        Args:
            embed_dim: Target embedding width; ``embed_dim // 2`` frequencies are
                sampled, so the output has ``2 * (embed_dim // 2)`` features.
            scale: Multiplier applied to the sampled frequencies.
        """
        super().__init__()
        n_freqs = embed_dim // 2
        frequencies = scale * torch.randn(n_freqs)
        # Registered as a (frozen) parameter under the name ``W``.
        self.W = nn.Parameter(frequencies, requires_grad=False)

    def forward(self, x):
        """
        Encode time values as concatenated sine and cosine features.

        Args:
            x: Tensor of time values; for a [batch] input the result is
                [batch, 2 * (embed_dim // 2)].

        Returns:
            ``sin`` features followed by ``cos`` features along the last axis.
        """
        # Outer product of times and frequencies, then scaled by 2*pi.
        angles = x.unsqueeze(1) * self.W.unsqueeze(0)
        angles = angles * 2 * torch.pi
        return torch.cat((angles.sin(), angles.cos()), dim=-1)