Advertisement
teknoraver

make net.core.{r,w}mem_{default,max} namespaced

May 4th, 2018
402
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Diff 15.15 KB | None | 0 0
  1. From 0988b2101bb26c0ea174d01e8e1dba7a6c809f22 Mon Sep 17 00:00:00 2001
  2. From: Matteo Croce <mcroce@redhat.com>
  3. Date: Wed, 26 Jul 2017 19:03:33 +0200
  4. Subject: [PATCH] net: make net.core.{r,w}mem_{default,max} namespaced
  5.  
  6. The following sysctls are global and can't be read or set from a netns:
  7.  
  8. net.core.rmem_default
  9. net.core.rmem_max
  10. net.core.wmem_default
  11. net.core.wmem_max
  12.  
  13. Make these sysctl parameters available from within a network
  14. namespace, allowing unique values to be set per network namespace.
  15.  
  16. My concern is about the initial values of these sysctls in a newly
  17. created netns: I'm not sure whether it is better to copy them from the
  18. init namespace or to set them to the default values.
  19.  
  20. Setting them to the default values has the advantage that a new namespace
  21. behaves like a freshly booted system, while copying them from the init
  22. netns has the advantage of keeping the current behaviour, as the values
  23. from the init netns are used. This patch copies them from the init netns.
  24.  
  25. Signed-off-by: Matteo Croce <mcroce@redhat.com>
  26. ---
  27. include/net/netns/core.h        |  5 +++
  28.  include/net/sock.h              |  3 --
  29.  include/net/tcp.h               |  3 +-
  30.  net/core/net_namespace.c        | 22 +++++++++++++
  31.  net/core/sock.c                 | 21 +++++--------
  32.  net/core/sysctl_net_core.c      | 70 ++++++++++++++++++++++-------------------
  33.  net/ipv4/ip_output.c            |  2 +-
  34.  net/ipv4/syncookies.c           |  3 +-
  35.  net/ipv4/tcp_minisocks.c        |  3 +-
  36.  net/ipv4/tcp_output.c           | 12 ++++---
  37.  net/ipv6/syncookies.c           |  3 +-
  38.  net/netfilter/ipvs/ip_vs_sync.c |  4 +--
  39.  12 files changed, 89 insertions(+), 62 deletions(-)
  40.  
  41. diff --git a/include/net/netns/core.h b/include/net/netns/core.h
  42. index 36c2d998a43c..71a57abf8d70 100644
  43. --- a/include/net/netns/core.h
  44. +++ b/include/net/netns/core.h
  45. @@ -10,6 +10,11 @@ struct netns_core {
  46.     struct ctl_table_header *sysctl_hdr;
  47.  
  48.     int sysctl_somaxconn;
  49. +   u32 sysctl_wmem_max;
  50. +   u32 sysctl_rmem_max;
  51. +
  52. +   u32 sysctl_wmem_default;
  53. +   u32 sysctl_rmem_default;
  54.  
  55.  #ifdef CONFIG_PROC_FS
  56.     int __percpu *sock_inuse;
  57. diff --git a/include/net/sock.h b/include/net/sock.h
  58. index 74d725fdbe0f..b7899813cc87 100644
  59. --- a/include/net/sock.h
  60. +++ b/include/net/sock.h
  61. @@ -2363,9 +2363,6 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo);
  62.  #define SK_WMEM_MAX        (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
  63.  #define SK_RMEM_MAX        (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
  64.  
  65. -extern __u32 sysctl_wmem_max;
  66. -extern __u32 sysctl_rmem_max;
  67. -
  68.  extern int sysctl_tstamp_allow_data;
  69.  extern int sysctl_optmem_max;
  70.  
  71. diff --git a/include/net/tcp.h b/include/net/tcp.h
  72. index 9c9b3768b350..ecae987a0f80 100644
  73. --- a/include/net/tcp.h
  74. +++ b/include/net/tcp.h
  75. @@ -1279,7 +1279,8 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk)
  76.  void tcp_select_initial_window(const struct sock *sk, int __space,
  77.                    __u32 mss, __u32 *rcv_wnd,
  78.                    __u32 *window_clamp, int wscale_ok,
  79. -                  __u8 *rcv_wscale, __u32 init_rcv_wnd);
  80. +                  __u8 *rcv_wscale, __u32 init_rcv_wnd,
  81. +                  __u32 rmem_max);
  82.  
  83.  static inline int tcp_win_from_space(const struct sock *sk, int space)
  84.  {
  85. diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
  86. index a11e03f920d3..265f0c1de55f 100644
  87. --- a/net/core/net_namespace.c
  88. +++ b/net/core/net_namespace.c
  89. @@ -23,6 +23,16 @@
  90.  #include <net/net_namespace.h>
  91.  #include <net/netns/generic.h>
  92.  
  93. +/* Take into consideration the size of the struct sk_buff overhead in the
  94. + * determination of these values, since that is non-constant across
  95. + * platforms.  This makes socket queueing behavior and performance
  96. + * not depend upon such differences.
  97. + */
  98. +#define _SK_MEM_PACKETS        256
  99. +#define _SK_MEM_OVERHEAD   SKB_TRUESIZE(256)
  100. +#define SK_WMEM_MAX        (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
  101. +#define SK_RMEM_MAX        (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
  102. +
  103.  /*
  104.   * Our network namespace constructor/destructor lists
  105.   */
  106. @@ -340,6 +350,18 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
  107.  static int __net_init net_defaults_init_net(struct net *net)
  108.  {
  109.     net->core.sysctl_somaxconn = SOMAXCONN;
  110. +   if (net_eq(net, &init_net)) {
  111. +       init_net.core.sysctl_wmem_max = SK_WMEM_MAX;
  112. +       init_net.core.sysctl_rmem_max = SK_RMEM_MAX;
  113. +       init_net.core.sysctl_wmem_default = SK_WMEM_MAX;
  114. +       init_net.core.sysctl_rmem_default = SK_RMEM_MAX;
  115. +   } else {
  116. +       net->core.sysctl_wmem_max = init_net.core.sysctl_wmem_max;
  117. +       net->core.sysctl_rmem_max = init_net.core.sysctl_rmem_max;
  118. +       net->core.sysctl_wmem_default = init_net.core.sysctl_wmem_default;
  119. +       net->core.sysctl_rmem_default = init_net.core.sysctl_rmem_default;
  120. +   }
  121. +
  122.     return 0;
  123.  }
  124.  
  125. diff --git a/net/core/sock.c b/net/core/sock.c
  126. index 6444525f610c..a6eaf5abe997 100644
  127. --- a/net/core/sock.c
  128. +++ b/net/core/sock.c
  129. @@ -309,14 +309,6 @@ static struct lock_class_key af_wlock_keys[AF_MAX];
  130.  static struct lock_class_key af_elock_keys[AF_MAX];
  131.  static struct lock_class_key af_kern_callback_keys[AF_MAX];
  132.  
  133. -/* Run time adjustable parameters. */
  134. -__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
  135. -EXPORT_SYMBOL(sysctl_wmem_max);
  136. -__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
  137. -EXPORT_SYMBOL(sysctl_rmem_max);
  138. -__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
  139. -__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
  140. -
  141.  /* Maximal space eaten by iovec or ancillary data plus some space */
  142.  int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
  143.  EXPORT_SYMBOL(sysctl_optmem_max);
  144. @@ -694,6 +686,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
  145.             char __user *optval, unsigned int optlen)
  146.  {
  147.     struct sock *sk = sock->sk;
  148. +   struct net *net = sock_net(sk);
  149.     int val;
  150.     int valbool;
  151.     struct linger ling;
  152. @@ -747,7 +740,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
  153.          * play 'guess the biggest size' games. RCVBUF/SNDBUF
  154.          * are treated in BSD as hints
  155.          */
  156. -       val = min_t(u32, val, sysctl_wmem_max);
  157. +       val = min_t(u32, val, net->core.sysctl_wmem_max);
  158.  set_sndbuf:
  159.         sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
  160.         sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
  161. @@ -768,7 +761,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
  162.          * play 'guess the biggest size' games. RCVBUF/SNDBUF
  163.          * are treated in BSD as hints
  164.          */
  165. -       val = min_t(u32, val, sysctl_rmem_max);
  166. +       val = min_t(u32, val, net->core.sysctl_rmem_max);
  167.  set_rcvbuf:
  168.         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
  169.         /*
  170. @@ -812,7 +805,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
  171.  
  172.     case SO_PRIORITY:
  173.         if ((val >= 0 && val <= 6) ||
  174. -           ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
  175. +           ns_capable(net->user_ns, CAP_NET_ADMIN))
  176.             sk->sk_priority = val;
  177.         else
  178.             ret = -EPERM;
  179. @@ -986,7 +979,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
  180.             clear_bit(SOCK_PASSSEC, &sock->flags);
  181.         break;
  182.     case SO_MARK:
  183. -       if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
  184. +       if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
  185.             ret = -EPERM;
  186.         else
  187.             sk->sk_mark = val;
  188. @@ -2759,8 +2752,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
  189.     timer_setup(&sk->sk_timer, NULL, 0);
  190.  
  191.     sk->sk_allocation   =   GFP_KERNEL;
  192. -   sk->sk_rcvbuf       =   sysctl_rmem_default;
  193. -   sk->sk_sndbuf       =   sysctl_wmem_default;
  194. +   sk->sk_rcvbuf       =   sock_net(sk)->core.sysctl_rmem_default;
  195. +   sk->sk_sndbuf       =   sock_net(sk)->core.sysctl_wmem_default;
  196.     sk->sk_state        =   TCP_CLOSE;
  197.     sk_set_socket(sk, sock);
  198.  
  199. diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
  200. index b1a2c5e38530..39b49545279f 100644
  201. --- a/net/core/sysctl_net_core.c
  202. +++ b/net/core/sysctl_net_core.c
  203. @@ -295,38 +295,6 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
  204.  
  205.  static struct ctl_table net_core_table[] = {
  206.  #ifdef CONFIG_NET
  207. -   {
  208. -       .procname   = "wmem_max",
  209. -       .data       = &sysctl_wmem_max,
  210. -       .maxlen     = sizeof(int),
  211. -       .mode       = 0644,
  212. -       .proc_handler   = proc_dointvec_minmax,
  213. -       .extra1     = &min_sndbuf,
  214. -   },
  215. -   {
  216. -       .procname   = "rmem_max",
  217. -       .data       = &sysctl_rmem_max,
  218. -       .maxlen     = sizeof(int),
  219. -       .mode       = 0644,
  220. -       .proc_handler   = proc_dointvec_minmax,
  221. -       .extra1     = &min_rcvbuf,
  222. -   },
  223. -   {
  224. -       .procname   = "wmem_default",
  225. -       .data       = &sysctl_wmem_default,
  226. -       .maxlen     = sizeof(int),
  227. -       .mode       = 0644,
  228. -       .proc_handler   = proc_dointvec_minmax,
  229. -       .extra1     = &min_sndbuf,
  230. -   },
  231. -   {
  232. -       .procname   = "rmem_default",
  233. -       .data       = &sysctl_rmem_default,
  234. -       .maxlen     = sizeof(int),
  235. -       .mode       = 0644,
  236. -       .proc_handler   = proc_dointvec_minmax,
  237. -       .extra1     = &min_rcvbuf,
  238. -   },
  239.     {
  240.         .procname   = "dev_weight",
  241.         .data       = &weight_p,
  242. @@ -536,6 +504,38 @@ static struct ctl_table netns_core_table[] = {
  243.         .extra1     = &zero,
  244.         .proc_handler   = proc_dointvec_minmax
  245.     },
  246. +   {
  247. +       .procname   = "wmem_max",
  248. +       .data       = &init_net.core.sysctl_wmem_max,
  249. +       .maxlen     = sizeof(int),
  250. +       .mode       = 0644,
  251. +       .proc_handler   = proc_dointvec_minmax,
  252. +       .extra1     = &min_sndbuf,
  253. +   },
  254. +   {
  255. +       .procname   = "rmem_max",
  256. +       .data       = &init_net.core.sysctl_rmem_max,
  257. +       .maxlen     = sizeof(int),
  258. +       .mode       = 0644,
  259. +       .proc_handler   = proc_dointvec_minmax,
  260. +       .extra1     = &min_rcvbuf,
  261. +   },
  262. +   {
  263. +       .procname   = "wmem_default",
  264. +       .data       = &init_net.core.sysctl_wmem_default,
  265. +       .maxlen     = sizeof(int),
  266. +       .mode       = 0644,
  267. +       .proc_handler   = proc_dointvec_minmax,
  268. +       .extra1     = &min_sndbuf,
  269. +   },
  270. +   {
  271. +       .procname   = "rmem_default",
  272. +       .data       = &init_net.core.sysctl_rmem_default,
  273. +       .maxlen     = sizeof(int),
  274. +       .mode       = 0644,
  275. +       .proc_handler   = proc_dointvec_minmax,
  276. +       .extra1     = &min_rcvbuf,
  277. +   },
  278.     { }
  279.  };
  280.  
  281. @@ -545,11 +545,15 @@ static __net_init int sysctl_core_net_init(struct net *net)
  282.  
  283.     tbl = netns_core_table;
  284.     if (!net_eq(net, &init_net)) {
  285. +       int i;
  286. +
  287.         tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
  288.         if (tbl == NULL)
  289.             goto err_dup;
  290.  
  291. -       tbl[0].data = &net->core.sysctl_somaxconn;
  292. +       /* Update the variables to point into the current struct net */
  293. +       for (i = 0; i < ARRAY_SIZE(netns_core_table) - 1; i++)
  294. +           tbl[i].data += (void *)net - (void *)&init_net;
  295.  
  296.         /* Don't export any sysctls to unprivileged users */
  297.         if (net->user_ns != &init_user_ns) {
  298. diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
  299. index 83c73bab2c3d..d61f03beb015 100644
  300. --- a/net/ipv4/ip_output.c
  301. +++ b/net/ipv4/ip_output.c
  302. @@ -1569,7 +1569,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
  303.     sk->sk_priority = skb->priority;
  304.     sk->sk_protocol = ip_hdr(skb)->protocol;
  305.     sk->sk_bound_dev_if = arg->bound_dev_if;
  306. -   sk->sk_sndbuf = sysctl_wmem_default;
  307. +   sk->sk_sndbuf = net->core.sysctl_wmem_default;
  308.     sk->sk_mark = fl4.flowi4_mark;
  309.     err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
  310.                  len, 0, &ipc, &rt, MSG_DONTWAIT);
  311. diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
  312. index c3387dfd725b..0f1c85b9b9fc 100644
  313. --- a/net/ipv4/syncookies.c
  314. +++ b/net/ipv4/syncookies.c
  315. @@ -390,7 +390,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
  316.     tcp_select_initial_window(sk, tcp_full_space(sk), req->mss,
  317.                   &req->rsk_rcv_wnd, &req->rsk_window_clamp,
  318.                   ireq->wscale_ok, &rcv_wscale,
  319. -                 dst_metric(&rt->dst, RTAX_INITRWND));
  320. +                 dst_metric(&rt->dst, RTAX_INITRWND),
  321. +                 sock_net(sk)->core.sysctl_rmem_max);
  322.  
  323.     ireq->rcv_wscale  = rcv_wscale;
  324.     ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
  325. diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
  326. index 57b5468b5139..1960eadb15b5 100644
  327. --- a/net/ipv4/tcp_minisocks.c
  328. +++ b/net/ipv4/tcp_minisocks.c
  329. @@ -383,7 +383,8 @@ void tcp_openreq_init_rwin(struct request_sock *req,
  330.         &req->rsk_window_clamp,
  331.         ireq->wscale_ok,
  332.         &rcv_wscale,
  333. -       rcv_wnd);
  334. +       rcv_wnd,
  335. +       sock_net(sk_listener)->core.sysctl_rmem_max);
  336.     ireq->rcv_wscale = rcv_wscale;
  337.  }
  338.  EXPORT_SYMBOL(tcp_openreq_init_rwin);
  339. diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
  340. index 383cac0ff0ec..d1730a188a75 100644
  341. --- a/net/ipv4/tcp_output.c
  342. +++ b/net/ipv4/tcp_output.c
  343. @@ -191,7 +191,7 @@ u32 tcp_default_init_rwnd(u32 mss)
  344.  void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
  345.                    __u32 *rcv_wnd, __u32 *window_clamp,
  346.                    int wscale_ok, __u8 *rcv_wscale,
  347. -                  __u32 init_rcv_wnd)
  348. +                  __u32 init_rcv_wnd, __u32 rmem_max)
  349.  {
  350.     unsigned int space = (__space < 0 ? 0 : __space);
  351.  
  352. @@ -221,7 +221,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
  353.     if (wscale_ok) {
  354.         /* Set window scaling on max possible window */
  355.         space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
  356. -       space = max_t(u32, space, sysctl_rmem_max);
  357. +       space = max_t(u32, space, rmem_max);
  358.         space = min_t(u32, space, *window_clamp);
  359.         while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
  360.             space >>= 1;
  361. @@ -3288,6 +3288,7 @@ static void tcp_connect_init(struct sock *sk)
  362.  {
  363.     const struct dst_entry *dst = __sk_dst_get(sk);
  364.     struct tcp_sock *tp = tcp_sk(sk);
  365. +   struct net *net = sock_net(sk);
  366.     __u8 rcv_wscale;
  367.     u32 rcv_wnd;
  368.  
  369. @@ -3295,7 +3296,7 @@ static void tcp_connect_init(struct sock *sk)
  370.      * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
  371.      */
  372.     tp->tcp_header_len = sizeof(struct tcphdr);
  373. -   if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
  374. +   if (net->ipv4.sysctl_tcp_timestamps)
  375.         tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
  376.  
  377.  #ifdef CONFIG_TCP_MD5SIG
  378. @@ -3331,9 +3332,10 @@ static void tcp_connect_init(struct sock *sk)
  379.                   tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
  380.                   &tp->rcv_wnd,
  381.                   &tp->window_clamp,
  382. -                 sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
  383. +                 net->ipv4.sysctl_tcp_window_scaling,
  384.                   &rcv_wscale,
  385. -                 rcv_wnd);
  386. +                 rcv_wnd,
  387. +                 net->core.sysctl_rmem_max);
  388.  
  389.     tp->rx_opt.rcv_wscale = rcv_wscale;
  390.     tp->rcv_ssthresh = tp->rcv_wnd;
  391. diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
  392. index e997141aed8c..511f06533821 100644
  393. --- a/net/ipv6/syncookies.c
  394. +++ b/net/ipv6/syncookies.c
  395. @@ -249,7 +249,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
  396.     tcp_select_initial_window(sk, tcp_full_space(sk), req->mss,
  397.                   &req->rsk_rcv_wnd, &req->rsk_window_clamp,
  398.                   ireq->wscale_ok, &rcv_wscale,
  399. -                 dst_metric(dst, RTAX_INITRWND));
  400. +                 dst_metric(dst, RTAX_INITRWND),
  401. +                 sock_net(sk)->core.sysctl_rmem_max);
  402.  
  403.     ireq->rcv_wscale = rcv_wscale;
  404.     ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst);
  405. diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
  406. index 001501e25625..7dc89ed15d3e 100644
  407. --- a/net/netfilter/ipvs/ip_vs_sync.c
  408. +++ b/net/netfilter/ipvs/ip_vs_sync.c
  409. @@ -1285,12 +1285,12 @@ static void set_sock_size(struct sock *sk, int mode, int val)
  410.     lock_sock(sk);
  411.     if (mode) {
  412.         val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
  413. -                 sysctl_wmem_max);
  414. +                 sock_net(sk)->core.sysctl_wmem_max);
  415.         sk->sk_sndbuf = val * 2;
  416.         sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
  417.     } else {
  418.         val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
  419. -                 sysctl_rmem_max);
  420. +                 sock_net(sk)->core.sysctl_rmem_max);
  421.         sk->sk_rcvbuf = val * 2;
  422.         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
  423.     }
  424. --
  425. 2.14.3
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement